Changeset 139
- Timestamp: 01/14/08 12:40:30 (10 months ago)
- Files:
  - apache_log/apache_log/bot_hosts.py (modified) (2 diffs)
  - apache_log/apache_log/host.py (modified) (3 diffs)
  - apache_log/apache_log/isp.py (modified) (3 diffs)
  - apache_log/apache_log/parser.py (modified) (1 diff)
  - apache_log/apache_log/user_agents.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
apache_log/apache_log/bot_hosts.py
r125 r139 29 29 30 30 BOT_DOMAINS = [ re.compile(re.escape(regex)+'$') for regex in ( 31 # Search engine crawlers 31 # Search engine crawlers (domain as string) 32 32 '.crawl.yahoo.net', # Yahoo 33 33 '.search.live.com', # live.com (Microsoft) … … 41 41 '.rambler.ru', 42 42 '.cazoodle.com', 43 )] + [ re.compile(regex) for regex in ( 44 # Domain regex 45 r'^crawler[0-9]{2}\.kaist\.ac\.kr$', 43 46 )] 44 47 apache_log/apache_log/host.py
r126 r139 35 35 self.exclude_referrers = re.compile('^'+re.escape(website_url)) 36 36 self.exclude_networks = [IP('127.0.0.0/8'), IP('192.168.0.0/16')] 37 self.exclude_domains = BOT_DOMAINS 37 self.exclude_domains = [] 38 38 self.countries = set() 39 39 … … 158 158 parser.add_option("--ignore-isp", help="Ignore ISP", 159 159 action="store_true") 160 parser.add_option("--ignore-bots", help="Ignore bots", 161 action="store_true") 160 162 parser.add_option("--syntax", help="Apache log syntax (default: %r)" % SYNTAX, 161 163 type="str", default=SYNTAX) … … 176 178 if options.ignore_isp: 177 179 parser.exclude_domains += ISP_DOMAINS 180 if options.ignore_bots: 181 parser.exclude_domains += BOT_DOMAINS 178 182 179 183 parser.parseFile(filename) apache_log/apache_log/isp.py
r125 r139 66 66 '.dip0.t-ipconnect.de', 67 67 '-fixip.tiscali.ch', 68 '.ppp.tiscali.fr', 68 69 '.access.telenet.be', 69 '.cust.tele2.it', 70 70 '.dsl.lsan03.sbcglobal.net', 71 71 '.adsl.easynet.fr', 72 '.rev.coltfrance.com', 73 '.ripe.coltfrance.com', 74 '.rev.numericable.fr', 75 '.retail.telecomitalia.it', 76 '.d4.club-internet.fr', 77 '.dsl.completel.net', 78 'reverse.completel.net', 79 '.fix.bluewin.ch', 80 '.dsl.scarlet.be', 81 '.adsl.dyn.edpnet.net', 72 82 )] 73 83 … … 76 86 r'(?:proxy-[0-9]+|\.abo)\.wanadoo\.fr', 77 87 r'(?:proxy[a-z0-9-]{4,6}|\.fbx|\.adsl)\.proxad\.net', 78 r'\.(?:adsl|pck|pr 0)\.nerim\.net', 88 r'\.(?:adsl|pck|pr[01]|net1|cnt|edu)\.nerim\.net', 79 89 r'08[0-9]{10}\.chello\.fr', 80 90 r'ip(?:-[0-9]{1,3}){4}\.asianetcom\.net', … … 86 96 r'\.(?:host|rev)\.[a-z]{2}\.colt\.net', 87 97 r'\.[a-z]{2}\.videotron\.ca', 98 r'\.cust\.tele2\.(?:it|fr)', 88 99 )] 89 100 apache_log/apache_log/parser.py
r120 r139 54 54 else: 55 55 self.code = None 56 if 'user_agent' in data: 57 self.user_agent = data['user_agent'] 58 tag = userAgentTag(parser.error, self.user_agent) 59 if tag: 60 self.tag = tag 61 else: 62 self.user_agent = None 56 # if 'user_agent' in data: 57 # self.user_agent = data['user_agent'] 58 # tag = userAgentTag(parser.error, self.user_agent) 59 # if tag: 60 # self.tag = tag 61 # else: 62 # self.user_agent = None 63 63 64 64 def unquote(self, url): apache_log/apache_log/user_agents.py
r123 r139 42 42 'MSNBOT_Mobile ', 43 43 'Trailfire-bot/', # http://trailfire.com 44 ))) 45 46 DOWNLOADER_REGEX = re.compile('^(%s)' % '|'.join(( 47 "FDM 2", 48 "EasyDL/", 49 "Wget/", 50 "wget$", 51 "libcurl-agent/", 44 'Wouah/', # http://www.wouah.eu 45 'travel-search', 52 46 ))) 53 47 … … 86 80 'nrsbot/', # loopip.com/robot.html 87 81 'WordPress/', 82 'nicebot$', 83 ))) 84 85 DOWNLOADER_REGEX = re.compile('^(%s)' % '|'.join(( 86 "FDM 2", 87 "EasyDL/", 88 "Wget/", 89 "wget$", 90 "libcurl-agent/", 88 91 ))) 89 92
