root/apache_log/apache_log/bot_hosts.py

Revision 327, 1.2 kB (checked in by haypo, 2 months ago)

Add msn bot

  • Property svn:eol-style set to native
Line 
1 from IPy import IP
2 import re
3
# Regular-expression sources (plain strings, not compiled) for hosts that
# belong to search engines.  Every literal dot is escaped so that "." only
# matches a real dot, and every entry is a raw string so the backslashes
# reach the regex engine untouched.
# NOTE(review): compilation and anchoring of these patterns happen in the
# caller, which is not visible here -- confirm how they are matched.
BOT_HOSTS = (
    # Google search engine hosts
    r"64\.233\.183\.104",
    r"64\.233\.169\.104",
    r"66\.102\.9\.104",
    r"72\.14\.253\.104",
    r"209\.85\.135\.104",
    r"209\.85\.129\.104",
    r"(images|www)\.google\.[a-z]{2,3}",

    # Popular search engines
    r"www\.altavista\.com",
    r"[a-z]+\.search\.yahoo\.com",
    r"search\.msn\.com",
    r"search\.live\.com",

    # Other search engines
    r"vivisimo\.com",
    r"search[0-9]-[0-9]\.free\.fr",
    r"search\.ke\.voila\.fr",
)
24
# Address ranges in the bot list, each wrapped in an IPy ``IP`` object.
# The trailing comment on each entry names the apparent owner of the range.
BOT_IPS = tuple(IP(network) for network in (
    '82.99.30.0/25',     # IM Teknik AB (?)
    '216.10.74.0/25',    # ohloh.net
))
29
# Compiled patterns for crawler domains, as a list.
# The first group holds literal domain strings: each one is passed through
# re.escape() and gets a trailing "$", so it can only match at the end of
# the tested name.  The second group holds hand-written regexes that are
# compiled unchanged.
BOT_DOMAINS = list(map(re.compile, tuple(
    re.escape(suffix) + '$' for suffix in (
        # Search engine crawlers (domain as string)
        '.crawl.yahoo.net',   # Yahoo
        '.search.live.com',   # live.com (Microsoft)
        '.search.msn.com',    # search.msn.com
        '.googlebot.com',     # Google
        '.google.com',        # Google
        '.exabot.com',        # Exabot
        'crawler.bloglines.com',
        '.cuill.com',
        '.picsearch.com',
        '.fastsearch.net',
        '.rambler.ru',
        '.cazoodle.com',
    )
) + (
    # Domain regex
    r'^crawler[0-9]{2}\.kaist\.ac\.kr$',
)))
48
Note: See TracBrowser for help on using the browser.