root/apache_log/apache_log/host.py

Revision 139, 5.8 kB (checked in by haypo, 8 months ago)

apache_log:

  • mise à jour de la liste des ISP
  • parser.py : désactiver le parseur d'User-Agent
  • host.py : crée l'option --ignore-bots
  • Property svn:eol-style set to native
Line 
1 #!/usr/bin/python -u
2 from apache_log.parser import ApacheLogParser
3 from apache_log.bot_hosts import BOT_DOMAINS
4 from apache_log.resolv_ip import ResolvIP
5 from sys import argv, exit, stderr
6 from IPy import IP
7 import re
8 from apache_log.isp import ISP_DOMAINS
9 from optparse import OptionParser
10
# Second-level labels that belong to a "long" registry suffix (e.g. the
# "edu" in "edu.tw").  HostList.cleanupName() keeps three trailing labels
# instead of two when the second-to-last label is listed here.
LONG_DOMAINS = (
    'com', 'co', 'ne', 'eu', 'ac',
    'net', 'edu', 'gouv', 'qc', 'uni',
)
13
class Host:
    """Statistics accumulated for one visiting host.

    Attributes:
        ip: address the host was created with.
        name: display name of the host.
        hits: number of times hit() has been called.
        fullnames: set of distinct non-empty full names seen.
        referrers: set of distinct non-empty referrers seen.
    """

    def __init__(self, ip, name):
        self.ip, self.name = ip, name
        self.hits = 0
        self.fullnames = set()
        self.referrers = set()

    def hit(self, referrer, fullname):
        """Count one request and remember its referrer and full name.

        Falsy values (None, empty string) are counted as a hit but are not
        stored in the corresponding set.
        """
        self.hits += 1
        recorded = ((fullname, self.fullnames), (referrer, self.referrers))
        for value, bucket in recorded:
            if value:
                bucket.add(value)
28
class HostList(ApacheLogParser):
    """Apache log parser that aggregates requests into per-host statistics.

    Each request handed to processRequest() is resolved to a host name,
    grouped under a shortened name and counted in a Host object stored in
    self.hosts.  The last label of every shortened name is collected in
    self.countries.
    """

    def __init__(self, syntax, website_url):
        """Create the parser.

        syntax is forwarded to ApacheLogParser; website_url is the URL of
        the analysed site, used to drop referrers that start with it.
        """
        ApacheLogParser.__init__(self, syntax)
        self.website_url = website_url
        # Shortened host name -> Host
        self.hosts = {}
        self.resolver = ResolvIP()
        # Referrers beginning with the site's own URL are not recorded
        self.exclude_referrers = re.compile('^'+re.escape(website_url))
        # Loopback and 192.168.x.x clients are ignored
        self.exclude_networks = [IP('127.0.0.0/8'), IP('192.168.0.0/16')]
        # Objects with a search() method, matched against the full host
        # name; main() extends this list with ISP_DOMAINS / BOT_DOMAINS
        self.exclude_domains = []
        self.countries = set()

    def cleanupName(self, name):
        """Shorten a resolved host name and extract its last label.

        Returns a (name, country) tuple.  When the name has more labels
        than are kept (two, or three if the second-to-last label is in
        LONG_DOMAINS), the result is "www." plus the kept labels, in lower
        case, and country is the lower-cased last label.  Otherwise the
        name is returned unchanged and country is None.
        """
        labels = name.lower().split('.')
        keep = 2
        if 2 < len(labels) and labels[-2] in LONG_DOMAINS:
            keep = 3
        if keep < len(labels):
            return '.'.join(['www'] + labels[-keep:]), labels[-1]
        # NOTE(review): short names keep their original letter case, so
        # "Example.COM" and "example.com" end up as different keys.
        return name, None

    def ipName(self, ip):
        """Return the resolver's name for ip, logging lookups to stderr.

        A message is written only when the address is not yet known to
        self.resolver (presumably a cache of earlier lookups).
        """
        addr = str(ip)
        do_resolv = (addr not in self.resolver)
        if do_resolv:
            # stderr.write() instead of "print >>stderr" so the code also
            # runs on Python 3; the emitted bytes are the same.
            stderr.write("Resolv name of IP %s...\n" % addr)
        name = self.resolver[addr]
        if name and do_resolv:
            stderr.write("Resolv name of IP %s... %r\n" % (addr, name))
        return name

    def processRequest(self, request):
        """Account one parsed request (reads request.host and
        request.referrer).

        The request is ignored when it has no host, when the host is in an
        excluded network, when it cannot be resolved, or when its full
        name matches an excluded domain.
        """
        ip = request.host
        if not ip:
            return
        for network in self.exclude_networks:
            if ip in network:
                return

        fullname = self.ipName(ip)
        if not fullname:
            return

        # Country statistics: recorded before the domain filter below, so
        # excluded hosts still contribute to the country list.
        name, country = self.cleanupName(fullname)
        if country:
            self.countries.add(country)

        # Exclude domain?
        for domain_regex in self.exclude_domains:
            if domain_regex.search(fullname):
                return

        # Only keep the full name when it adds information to the short one
        if fullname == name:
            fullname = None

        host = self.hosts.get(name)
        if host is None:
            host = self.hosts[name] = Host(ip, name)

        referrer = request.referrer
        if referrer and self.exclude_referrers.search(referrer):
            referrer = None
        host.hit(referrer, fullname)
99
def _escapeHtml(text):
    """Return text with the HTML special characters &, <, > and " escaped."""
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2
    return escape(text, True)


def createLink(url, text=None, max_length=40):
    """Return an HTML anchor pointing to url, or '-' when url is empty.

    text is the visible label (defaults to url); it is cut to max_length
    characters followed by '...' when longer.  Both the URL and the label
    are HTML-escaped because they come from log data (referrers, resolved
    host names) and must not be able to inject markup into the report.
    """
    if not url:
        return '-'
    if not text:
        text = url
    # Truncate the raw text first so an entity is never cut in half
    if max_length < len(text):
        text = text[:max_length] + '...'
    return '<a href="%s">%s</a>' % (_escapeHtml(url), _escapeHtml(text))
108
def htmlReport(parser):
    """Print an HTML report of the hosts collected by parser to stdout.

    Reads parser.website_url, parser.first_datetime, parser.last_datetime,
    parser.countries and parser.hosts.  The report is a heading, optional
    timestamp/duration/country paragraphs and a table with one row per
    host, most hits first.

    Text taken from the log or from name resolution is HTML-escaped.  The
    host and referrer links are produced by createLink().
    """
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    # Sort hosts by hits (sorted() also works on Python 3 dict views)
    hosts = sorted(parser.hosts.values(),
                   key=lambda host: host.hits, reverse=True)

    # Create HTML document.  The summary is written before the table is
    # opened: headings and paragraphs are not valid directly inside <table>.
    print('<html><body>')
    print("<h1>Statistics of %s</h1>" % escape(parser.website_url, True))

    if parser.first_datetime:
        print("<p>First timestamp : <em>%s</em></p>" % parser.first_datetime)
    if parser.last_datetime:
        print("<p>Last timestamp : <em>%s</em></p>" % parser.last_datetime)
    if parser.first_datetime and parser.last_datetime:
        duration = parser.last_datetime - parser.first_datetime
        print("<p>Duration : <em>%s</em></p>" % duration)

    countries = sorted(parser.countries)
    if countries:
        print("<p>Countries (%s) : %s</p>" % (
            len(countries), escape(", ".join(countries), True)))

    print('<table border="1">')
    header = ('Hits', 'Host', 'Full name', 'Referrers')
    print('<tr>%s</tr>' % ''.join('<th>%s</th>' % item for item in header))
    for host in hosts:
        fullnames = "<br>".join(
            escape(fullname, True) for fullname in sorted(host.fullnames))
        if not fullnames:
            fullnames = '-'
        # Sorted like the full names so the output is deterministic
        referrers = "<br>".join(
            createLink(url) for url in sorted(host.referrers))
        if not referrers:
            referrers = '-'
        name = host.name
        if name:
            name = createLink('http://'+name, name)
        else:
            name = escape(str(host.ip), True)
        row = (host.hits, name, fullnames, referrers)
        print('<tr>%s</tr>' % ''.join('<td>%s</td>' % item for item in row))
    print('</table></body></html>')
153
def parseOptions():
    """Parse the command line and return (options, arguments).

    Exactly two positional arguments are required (log file name and
    website URL); otherwise the help text is printed and the process exits
    with status 1.
    """
    default_syntax = "{host} - {user} {date} {request} {answer} {referrer} {user_agent}"

    cmdline = OptionParser(usage="%prog [options] filename.log http://website.url")
    # Boolean switches
    switches = (
        ("--ignore-isp", "Ignore ISP"),
        ("--ignore-bots", "Ignore bots"),
    )
    for flag, description in switches:
        cmdline.add_option(flag, help=description, action="store_true")
    cmdline.add_option(
        "--syntax",
        help="Apache log syntax (default: %r)" % default_syntax,
        type="str",
        default=default_syntax)

    options, arguments = cmdline.parse_args()
    if len(arguments) == 2:
        return options, arguments
    cmdline.print_help()
    exit(1)
170
def main():
    """Command-line entry point: parse a log file and print the HTML report.

    Ctrl+C at any stage stops the program with a short notice on stderr.
    """
    try:
        options, arguments = parseOptions()
        filename, website_url = arguments

        parser = HostList(options.syntax, website_url)
        if options.ignore_isp:
            parser.exclude_domains += ISP_DOMAINS
        if options.ignore_bots:
            parser.exclude_domains += BOT_DOMAINS

        parser.parseFile(filename)
        htmlReport(parser)
    except KeyboardInterrupt:
        # stdout carries the HTML report, so the notice goes to stderr like
        # the other diagnostics; stderr.write() also runs on Python 3.
        stderr.write("Interrupt!\n")


# Run the report only when executed as a script, not when imported
if __name__ == "__main__":
    main()
190
Note: See TracBrowser for help on using the browser.