root/apache_log/apache_log/user_agents.py

Revision 139, 6.0 kB (checked in by haypo, 10 months ago)

apache_log:

  • maj liste ISP
  • parser.py: desactiver parseur d'User-Agent
  • host.py: cree l'option --ignore-bots
  • Property svn:eol-style set to native
Line 
1 from apache_log.tags import (TAG_CRAWLER, TAG_DOWNLOADER, TAG_BOT, TAG_RSS,
2     TAG_SUBVERSION, TAG_BROWSER, TAG_EMPTY, TAG_RANDOM)
3 import re
4 from apache_log.mozilla import MOZILLA_PREFIX_REGEX, parseMozillaUserAgent
5
# User-agent prefixes of search-engine crawlers and spiders.
#
# Every entry is a regular-expression fragment (not a plain string): the
# fragments are joined into a single alternation anchored at the start of the
# user agent.  Entries ending in "$" must match the whole string.  Raw strings
# are used so that regex escapes such as "\." reach the regex engine untouched,
# and literal dots are escaped so they do not match arbitrary characters.
CRAWLER_REGEX = re.compile(r"^(%s)" % "|".join((
    r"msnbot/",                   # http://www.live.com/
    r"Speedy Spider",             # http://www.entireweb.com/
    r"Baiduspider",               # http://www.baidu.com/
    r"msnbot-media/",             # http://www.live.com/
    r"Yeti/",
    r"shelob v1",
    r"lwp-trivial/",
    r"LWP::Simple/",
    r"OrangeSpider",
    r"Yahoo-MMCrawler/",          # http://www.yahoo.com/
    r"holmes/",
    r"Googlebot-(Image|Video)",   # http://www.google.com/
    r"Krugle/",                   # http://www.krugle.com/
    r"FuseBulb\.Com",             # http://www.fusebulb.com/
    r"FAST MetaWeb Crawler",      # http://www.fastsearch.com/
    r"KDDI-CA23 UP\.Browser/",
    r"Y!J-SRD/",
    r"FAST Enterprise Crawler",   # used by Virk.dk - udvikling (thomas.bentzen@capgemini.com)
    r"Nokia6682/2\.0 \(3\.01\.1\) SymbianOS/8\.0 Series60/2\.6 Profile/MIDP-2\.0 configuration/CLDC-1\.1 UP\.Link/6\.3\.0\.0\.0 \(compatible;YahooSeeker/",
    r"WebImages 0\.3",            # http://herbert.groot.jebbink.nl/?app=WebImages
    r"webcollage/",
    r"gsa-crawler ",              # Enterprise; M2-PE3CGVABCA2AB; dpeterka@operative.com
    r"Pete-Spider Light/1\.0",
    r"MSRBOT ",                   # http://research.microsoft.com/research/sv/msrbot/
    r"TMCrawler$",
    r"appie 1\.1 ",               # http://www.walhello.com
    r"Wells Search II$",
    r"NG/2\.0$",
    r"LiteFinder/",               # http://www.litefinder.net/about.html
    r"Nokia6230i/\. FAST Crawler$",
    r"Bookdog/",
    r"A WinHTTP Example Program",
    r"wadaino\.jp-crawler ",      # http://wadaino.jp/
    r"Yandex/",
    r"Teemer ",                   # http://www.netseer.com/crawler.html
    r"MSNBOT_Mobile ",
    r"Trailfire-bot/",            # http://trailfire.com
    r"Wouah/",                    # http://www.wouah.eu
    r"travel-search",
)))
47
# User-agent prefixes of miscellaneous robots (validators, link checkers,
# archivers, ...).
#
# Every entry is a regular-expression fragment; the fragments are joined into
# a single alternation anchored at the start of the user agent.  Entries
# ending in "$" must match the whole string.  Raw strings keep regex escapes
# intact, and literal dots are escaped so they only match a dot.
BOT_REGEX = re.compile(r"^(%s)" % "|".join((
    r"freshmeat\.net URI validator",  # http://freshmeat.net/
    r"PortUrlChecker/",
    r"Debian uscan",              # http://dehs.alioth.debian.org/uscan.html
    r"VadixBot",
    r"NutchCVS/",
    r"larbin",
    r"ia_archiver",
    r"CazoodleBot/",
    r"woriobot",
    r"Attributor\.comBot",
    r"PHP version tracker",
    r"Gigabot/",
    r"bot/",
    r"LargeSmall Crawler",
    r"StackRambler/",
    r"SurveyBot/",                # http://www.whois.sc/
    r"rbot http util",            # http://linuxbrit.co.uk/rbot/
    r"LinkChecker/",              # http://linkchecker.sourceforge.net/
    r"Gaisbot/",                  # http://gais.cs.ccu.edu.tw/robot.php
    r"HMSE_Robot$",
    r"Slurpy Verifier/",
    r"psbot/",                    # http://www.picsearch.com/bot.html
    r"disco/Nutch-",              # imagine@gmail.com ; nedrocks@gmail.com
    r"Trailfire/",                # http://lucene.apache.org/nutch/bot.html
    r"GurujiBot/",                # http://www.guruji.com/en/WebmasterFAQ.html
    r"Mozilla/4\.0 compatible FurlBot/Furl Search 2\.0 \(FurlBot;",  # http://www.furl.net
    r"Lsearch/sondeur$",
    r"MJ12bot/",                  # http://majestic12.co.uk/bot.php
    r"Microsoft URL Control",
    r"Portsurvey/",               # FreeBSD ports using libwww-perl
    r"Mozilla/3\.0 \(compatible; Indy Library\)",
    r"nrsbot/",                   # loopip.com/robot.html
    r"WordPress/",
    r"nicebot$",
)))
84
# Download managers and command-line fetchers.  The fragments below are
# regular expressions, OR-ed together and anchored at the start of the user
# agent; "wget$" only matches the bare string "wget".
DOWNLOADER_REGEX = re.compile(
    '^(%s)' % '|'.join([
        'FDM 2',
        'EasyDL/',
        'Wget/',
        'wget$',
        'libcurl-agent/',
    ])
)
92
# Feed readers and feed-fetching services.  Each fragment is a regular
# expression; all of them are OR-ed together and anchored at the start of the
# user agent.  Matching is case-sensitive, hence the two spellings of
# Google's feed fetcher.
RSS_REGEX = re.compile(
    '^(%s)' % '|'.join([
        'Bloglines',
        'Feedfetcher-Google',
        'RSSOwl/',
        'Akregator/',
        'Netvibes',
        'Vienna/',
        'AppleSyndication/',
        'Liferea/',
        'ELI/',                 # DAUM RSS Robot, http://ws.daum.net/aboutkr.html
        'FeedFetcher-Google',   # http://www.google.com/feedfetcher.html
        'AideRSS/',             # aiderss.com
        'Feedreader ',
    ])
)
107
# Subversion clients: user agent starts with "SVN/" or "SVNKit " (note the
# trailing space in the second alternative).
SUBVERSION_REGEX = re.compile(
    r"^(SVN/|SVNKit )"
)
109
# User agents tagged as "browser": interactive browsers plus generic HTTP
# client libraries, and a handful of exact legacy Mozilla strings.
#
# Every entry is a regular-expression fragment; the fragments are joined into
# a single alternation anchored at the start of the user agent.  Entries
# ending in "$" must match the whole string.  Raw strings are used so that
# regex escapes ("\.", "\[", "\(") are passed to the regex engine verbatim
# instead of being treated as (invalid) string escape sequences.
BROWSER_REGEX = re.compile(r"^(%s)" % "|".join((
    r"Opera/",
    r"Avant Browser",
    r"w3m/",
    r"Lynx/",
    r"Netscape",
    r"DoCoMo/",
    r"CFNetwork/",
    r"Nokia6820/",
    r"Python-urllib/",
    r"Sleipnir/",
    r"Jakarta Commons-HttpClient",
    r"libwww-perl/",
    r"Apache",
    r"DataCha0s/",
    r"voyager/",
    r"Java/",
    r"Portal Manager",
    r"compatible$",
    r"Mozilla/[457]\.0$",
    r"mozilla 4\.0$",
    r"ELinks",
    r"Links ",
    r"POE-Component-Client-HTTP/",
    r"Mozilla/0\.91 Beta \(Windows\)$",
    r"Mozilla/3\.0 \(compatible\)$",
    r"Mozilla/3\.01 \(compatible;\)$",
    r"Mozilla/4\.5 \[en\] \(Win98; I\)$",   # Netscape 4.5 on Windows 98
    r"Mozilla/4\.5 \(compatible; HTTrack 3\.0x; Windows 98\)$",
    r"Mozilla/4\.8 \[en\] \(X11; U; Linux 2\.4\.20-8 i686\)$",
    r"T-Mobile Dash Mozilla/4\.0",
    r"Kazehakase/",
    r"HTC_P3300 Mozilla/",
)))
144
# Gibberish strings observed as user agents.  Unlike the other tables, the
# alternation is anchored at both ends, so each entry must match the user
# agent in full.
RANDOM_REGEX = re.compile(
    '^(%s)$' % '|'.join([
        'cpohlltsgtodxkdggpOh8x',
        'osxsl cpwljvhyroXesvgnhmwjucblj evv',
        'Prtmaldw hujcqopg yfklsatx',
        'hahahhahhaaahaha',
        'Rvxleb mwyxfnhb wybm',
        'Sagd jqnvxh qpvxu',
        'Cvbm fyihkar oxin',
        'vsfq2ynoqPxamsaiieaqPnwkq',
        'yisd0mlomenbacmlkkixiawf',
        'vpsqkb2epevbejvjkhrnbg  eot2ojj hiq',
        'tbms7dorslbgoupwptposdscdNbucimv vj7mpd',
        'Qcdgjyexi obchu bpdnhrso',
        'Exdcjqp knudlw ralzxbgmu',
    ])
)
160
# Lookup table pairing each compiled user-agent pattern with the tag it
# stands for.  userAgentTag() walks the table from top to bottom.
REGEX_TAGS = (
    (CRAWLER_REGEX,    TAG_CRAWLER),
    (DOWNLOADER_REGEX, TAG_DOWNLOADER),
    (BOT_REGEX,        TAG_BOT),
    (RSS_REGEX,        TAG_RSS),
    (SUBVERSION_REGEX, TAG_SUBVERSION),
    (BROWSER_REGEX,    TAG_BROWSER),
    (RANDOM_REGEX,     TAG_RANDOM),
)
170
def userAgentTag(error, text):
    """Classify a raw User-Agent string and return its tag.

    Parameters:
        error: callable taking one message string; it is called when the
            user agent cannot be classified, and is also handed to
            parseMozillaUserAgent().
        text: the User-Agent value; an empty value, None or "-" counts as
            "no user agent".

    Returns:
        TAG_EMPTY for a missing user agent, otherwise the tag paired with the
        first pattern of REGEX_TAGS that matches, otherwise whatever
        parseMozillaUserAgent() returns when text matches
        MOZILLA_PREFIX_REGEX, otherwise None (after reporting through
        error()).
    """
    # Empty user agent: '-'.  Checked first so that None never reaches
    # regex.search(); none of the patterns can match "" or "-" anyway.
    if not text or text == "-":
        return TAG_EMPTY

    # Known user agents: the first matching pattern decides, and the tag
    # returned is the one paired with that pattern.
    for regex, tag in REGEX_TAGS:
        if regex.search(text):
            return tag

    # Parse Mozilla user agent
    match = MOZILLA_PREFIX_REGEX.match(text)
    if match:
        return parseMozillaUserAgent(error, match.group(1))

    error("Unable to parse user agent: %r" % text)
    return None
187
Note: See TracBrowser for help on using the browser.