| 1 |
|
|---|
| 2 |
from apache_log.parser import ApacheLogParser |
|---|
| 3 |
from apache_log.bot_hosts import BOT_DOMAINS |
|---|
| 4 |
from apache_log.resolv_ip import ResolvIP |
|---|
| 5 |
from sys import argv, exit, stderr |
|---|
| 6 |
from IPy import IP |
|---|
| 7 |
import re |
|---|
| 8 |
from apache_log.isp import ISP_DOMAINS |
|---|
| 9 |
from optparse import OptionParser |
|---|
| 10 |
|
|---|
| 11 |
|
|---|
| 12 |
# Labels that, when found as the second-to-last label of a host name
# (e.g. "co" in "example.co.uk"), make HostList.cleanupName keep three
# trailing labels instead of two.
LONG_DOMAINS = (
    'com', 'co', 'ne', 'eu', 'ac',
    'net', 'edu', 'gouv', 'qc', 'uni',
)
|---|
| 13 |
|
|---|
| 14 |
class Host:
    """Per-host statistics: hit counter plus the names and referrers seen.

    Attributes:
        ip: value given at construction, stored unchanged.
        name: value given at construction, stored unchanged.
        hits: number of times hit() has been called.
        fullnames: set of truthy ``fullname`` values passed to hit().
        referrers: set of truthy ``referrer`` values passed to hit().
    """

    def __init__(self, ip, name):
        # Counters and collections start empty; hit() fills them.
        self.hits = 0
        self.fullnames = set()
        self.referrers = set()
        self.name = name
        self.ip = ip

    def hit(self, referrer, fullname):
        """Count one hit and remember its referrer and full name, if any.

        Falsy values (``None``, empty string) are counted as a hit but are
        not added to the corresponding set.
        """
        self.hits = self.hits + 1
        recorded = (
            (self.fullnames, fullname),
            (self.referrers, referrer),
        )
        for bucket, value in recorded:
            if value:
                bucket.add(value)
|---|
| 28 |
|
|---|
| 29 |
class HostList(ApacheLogParser):
    """Log parser that aggregates requests into one Host entry per site name.

    Attributes:
        website_url: URL of the analysed site; referrers starting with it
            are ignored.
        hosts: dict mapping a cleaned-up host name to its Host object.
        resolver: ResolvIP instance used to turn an address into a name.
        exclude_referrers: compiled regex matching referrers that start
            with ``website_url``.
        exclude_networks: IPy networks whose addresses are skipped.
        exclude_domains: objects with a ``search()`` method (e.g. compiled
            regexes); a match on the resolved full name skips the request.
        countries: set of last labels (lower case) of the resolved names.
    """

    def __init__(self, syntax, website_url):
        ApacheLogParser.__init__(self, syntax)
        self.website_url = website_url
        self.hosts = {}
        self.resolver = ResolvIP()
        self.exclude_referrers = re.compile('^' + re.escape(website_url))
        # NOTE(review): only loopback and 192.168/16 are skipped; 10/8 and
        # 172.16/12 are not -- confirm whether that is intended.
        self.exclude_networks = [IP('127.0.0.0/8'), IP('192.168.0.0/16')]
        self.exclude_domains = []
        self.countries = set()

    def cleanupName(self, name):
        """Reduce a host name to ``www.<domain>`` and extract its last label.

        Keeps the last two labels, or the last three when the second-to-last
        label is listed in LONG_DOMAINS, and prefixes them with ``www``.

        Returns:
            ``(name, country)``. When the name has more labels than are
            kept, ``name`` is the lower-cased ``www.``-prefixed form and
            ``country`` is the lower-cased last label. Otherwise the name
            is returned unchanged and ``country`` is ``None``.
        """
        lower = name.lower().split('.')
        nb = len(lower)
        if 2 < nb and lower[-2] in LONG_DOMAINS:
            truncate = 3
        else:
            truncate = 2
        if truncate < nb:
            return '.'.join(['www'] + lower[-truncate:]), lower[-1]
        return name, None

    def ipName(self, ip):
        """Return the resolver's name for *ip*, logging new lookups to stderr.

        A progress line is written before any lookup of an address that is
        not yet ``in self.resolver``, and a second line with the result when
        that lookup yields a truthy name.
        """
        addr = str(ip)
        do_resolv = (addr not in self.resolver)
        if do_resolv:
            # stderr.write works on Python 2 and 3, unlike "print >>stderr".
            stderr.write("Resolv name of IP %s...\n" % addr)
        name = self.resolver[addr]
        if name and do_resolv:
            stderr.write("Resolv name of IP %s... %r\n" % (addr, name))
        return name

    def processRequest(self, request):
        """Account one log request to the Host entry of its origin.

        The request is ignored when it has no host, when the host is in one
        of ``exclude_networks``, when no name is resolved for it, or when
        the resolved name matches one of ``exclude_domains``.
        """
        ip = request.host
        if not ip:
            return
        if any(ip in network for network in self.exclude_networks):
            return

        fullname = self.ipName(ip)
        if not fullname:
            return

        name, country = self.cleanupName(fullname)
        # The country is recorded before the domain filter below, so
        # excluded domains still contribute to the country list.
        if country:
            self.countries.add(country)

        if any(regex.search(fullname) for regex in self.exclude_domains):
            return

        # Only keep the full name when it adds information over the key.
        if fullname == name:
            fullname = None

        host = self.hosts.get(name)
        if host is None:
            host = Host(ip, name)
            self.hosts[name] = host

        referrer = request.referrer
        if referrer and self.exclude_referrers.search(referrer):
            # Internal navigation on the analysed site is not a referrer.
            referrer = None
        host.hit(referrer, fullname)
|---|
| 99 |
|
|---|
| 100 |
def createLink(url, text=None, max_length=40):
    """Return an HTML ``<a>`` element for *url*, or ``'-'`` if it is empty.

    Args:
        url: link target; a falsy value yields ``'-'``.
        text: link label; defaults to *url* when falsy.
        max_length: labels longer than this are cut to ``max_length``
            characters and suffixed with ``'...'``.

    Returns:
        The markup as a string. Both the target and the label are
        HTML-escaped, because they originate from log data (referrers,
        resolved host names) that an outside party controls.
    """
    # Local import keeps the block self-contained on Python 2 and 3.
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    if not url:
        return '-'
    if not text:
        text = url
    # Truncate the raw label first so an escaped entity is never cut in half.
    if max_length < len(text):
        text = text[:max_length] + '...'
    # NOTE(review): the URL scheme is not validated (e.g. "javascript:").
    return '<a href="%s">%s</a>' % (escape(url, True), escape(text, True))
|---|
| 108 |
|
|---|
| 109 |
def htmlReport(parser):
    """Print an HTML report of the hosts collected by *parser* to stdout.

    Reads ``parser.website_url``, ``parser.first_datetime``,
    ``parser.last_datetime``, ``parser.countries`` and ``parser.hosts``.
    Hosts are listed by decreasing hit count; full names and referrers of
    each host are listed in sorted order so the output is reproducible.

    Values written directly by this function are HTML-escaped. Host names
    and referrers are rendered through createLink().
    """
    # Local import keeps the block self-contained on Python 2 and 3.
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    def text(value):
        # "%s" formatting accepts any object (e.g. an IP object).
        return escape('%s' % (value,), True)

    # sorted() works on Python 3, where dict.values() is not a list.
    hosts = sorted(parser.hosts.values(),
                   key=lambda host: host.hits, reverse=True)

    # Heading and summary go before the table: block elements are not
    # allowed directly inside <table>.
    print('<html><body>')
    print("<h1>Statistics of %s</h1>" % text(parser.website_url))

    if parser.first_datetime:
        print("<p>First timestamp : <em>%s</em></p>" % parser.first_datetime)
    if parser.last_datetime:
        print("<p>Last timestamp : <em>%s</em></p>" % parser.last_datetime)
    if parser.first_datetime and parser.last_datetime:
        duration = parser.last_datetime - parser.first_datetime
        print("<p>Duration : <em>%s</em></p>" % duration)

    countries = sorted(parser.countries)
    if countries:
        print("<p>Countries (%s) : %s</p>" % (
            len(countries), ", ".join(text(item) for item in countries)))

    print('<table border="1">')
    header = ('Hits', 'Host', 'Full name', 'Referrers')
    print('<tr>%s</tr>' % ''.join('<th>%s</th>' % item for item in header))
    for host in hosts:
        fullnames = "<br>".join(
            text(item) for item in sorted(host.fullnames)) or '-'
        referrers = "<br>".join(
            createLink(url) for url in sorted(host.referrers)) or '-'
        if host.name:
            name = createLink('http://' + host.name, host.name)
        else:
            # No name available: fall back to the address.
            name = text(host.ip)
        row = (host.hits, name, fullnames, referrers)
        print('<tr>%s</tr>' % ''.join('<td>%s</td>' % item for item in row))
    print('</table></body></html>')
|---|
| 153 |
|
|---|
| 154 |
def parseOptions():
    """Parse the command line and return ``(options, arguments)``.

    Recognised options are ``--ignore-isp``, ``--ignore-bots`` (both
    flags) and ``--syntax`` (string, with a built-in default). Exactly two
    positional arguments are required; otherwise the help text is printed
    and the process exits with status 1.
    """
    default_syntax = "{host} - {user} {date} {request} {answer} {referrer} {user_agent}"

    cli = OptionParser(usage="%prog [options] filename.log http://website.url")
    flags = (
        ("--ignore-isp", "Ignore ISP"),
        ("--ignore-bots", "Ignore bots"),
    )
    for flag, description in flags:
        cli.add_option(flag, help=description, action="store_true")
    syntax_help = "Apache log syntax (default: %r)" % default_syntax
    cli.add_option("--syntax", help=syntax_help,
                   type="str", default=default_syntax)

    options, arguments = cli.parse_args()
    wrong_count = len(arguments) != 2
    if wrong_count:
        # Expecting exactly: log file name and website URL.
        cli.print_help()
        exit(1)
    return options, arguments
|---|
| 170 |
|
|---|
| 171 |
def main():
    """Command-line entry point: parse a log file and print the HTML report.

    Takes the log file name and website URL from parseOptions(), builds a
    HostList, optionally extends its excluded domains with ISP_DOMAINS
    and/or BOT_DOMAINS, parses the file and prints the report. A
    KeyboardInterrupt is reported on stderr instead of a traceback.
    """
    try:
        options, arguments = parseOptions()
        filename, website_url = arguments

        parser = HostList(options.syntax, website_url)
        if options.ignore_isp:
            parser.exclude_domains.extend(ISP_DOMAINS)
        if options.ignore_bots:
            parser.exclude_domains.extend(BOT_DOMAINS)

        parser.parseFile(filename)
        htmlReport(parser)
    except KeyboardInterrupt:
        # stdout carries the HTML report, so the notice goes to stderr.
        stderr.write("Interrupt!\n")
|---|
| 187 |
|
|---|
| 188 |
# Run the report generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|---|
| 190 |
|
|---|