root/apache_log/apache_log/parser.py

Revision 220, 8.7 kB (checked in by haypo, 6 months ago)

Fix parser for user agent with \"

  • Property svn:eol-style set to native
Line 
1 #!/usr/bin/python
2 # -*- coding: iso-8859-1 -*-
3 #
4 # Python script to parse Apache log file
5 # Creation: 21 septembre 2005
6 # Author: Victor Stinner
7
8 import re
9 from time import time
10 from sys import exit
11 from urllib import unquote
12 from datetime import datetime
13 from logging import warning
14 from IPy import IP
15 from apache_log.user_agents import userAgentTag
16 from apache_log.tags import DEFAULT_TAG
17 from sys import stdin, stderr
18
class ApacheRequest:
    """One request extracted from a line of an Apache access log.

    Built from the dictionary of named groups captured by the parser's
    regular expression.  Every attribute whose key is absent from that
    dictionary is set to None.

    Attributes:
        parser      -- parser object that produced the request
        raw_string  -- the log line as passed by the caller
        tag         -- DEFAULT_TAG (user-agent tagging is disabled)
        method      -- captured HTTP method, or None
        origin      -- captured origin field, or None
        size        -- captured size field as text, or None
        timestamp   -- naive datetime of the request, or None
        url         -- unquoted URL, or None
        host        -- IPy.IP instance built from the host field, or None
        referrer    -- unquoted referrer, None if absent or "-"
        user        -- user name, None if absent or "-"
        code        -- HTTP status code as an int, or None
        user_agent  -- unquoted user agent, None if absent or "-"
    """

    # Trailing UTC offset of a timestamp, for example " +0200" or " -0500".
    _TZ_OFFSET = re.compile(r"\s*[+-][0-9]{4}$")

    def __init__(self, parser, raw_string, data):
        """Fill the request attributes from *data*.

        parser: object exposing a ``date_format`` attribute (strptime format).
        raw_string: the original log line.
        data: dictionary of captured fields (strings).

        Raises ValueError if the date does not match ``parser.date_format``
        or if the status code is not a number; IP() may also reject the
        host value.
        """
        self.parser = parser
        self.raw_string = raw_string
        self.tag = DEFAULT_TAG

        self.method = data.get('method')           # str
        self.origin = data.get('origin')           # str
        # Stored exactly as captured (text, not converted to a number).
        self.size = data.get('size')

        if 'date' in data:
            self.timestamp = self._parseTimestamp(
                data['date'], self.parser.date_format)
        else:
            self.timestamp = None

        if 'url' in data:
            self.url = self.unquote(data['url'])
        else:
            self.url = None

        if 'host' in data:
            self.host = IP(data['host'])
        else:
            self.host = None

        if 'referrer' in data:
            self.referrer = self.unquote(data['referrer'])
        else:
            self.referrer = None

        # "-" is the placeholder for "no authenticated user"
        self.user = data.get('user', None)
        if self.user == '-':
            self.user = None

        if 'code' in data:
            self.code = int(data['code'])
        else:
            self.code = None

        # Tagging the request from its user agent (userAgentTag) is
        # currently disabled, so self.tag always stays DEFAULT_TAG.
        if 'user_agent' in data:
            self.user_agent = self.unquote(data['user_agent'])
        else:
            self.user_agent = None

    @staticmethod
    def _parseTimestamp(stamp, date_format):
        """Convert a log timestamp into a naive datetime.

        A trailing UTC offset is discarded whatever its sign: the previous
        code only recognised "+hhmm", so a log written with a negative
        offset ("-0500") made strptime() fail.  The offset is dropped, not
        applied, so the result is the time as written in the log.
        """
        stamp = ApacheRequest._TZ_OFFSET.sub("", stamp)
        return datetime.strptime(stamp, date_format)

    def unquote(self, url):
        """Decode a quoted log field.

        Backslash-escaped double quotes are restored, then %xx sequences are
        decoded.  Returns None when the decoded value is "-" (empty field).
        """
        url = url.replace(r'\"', '"')
        url = unquote(url)
        if url == "-":
            return None
        return url
71
class ApacheLogParser:
    """Parse Apache access log lines described by a format string.

    The format string ("syntax") is literal regular-expression text in which
    each ``{name}`` placeholder is replaced by the class attribute
    ``REGEX_NAME`` (for example ``{date}`` becomes REGEX_DATE).  Every parsed
    line is turned into an ApacheRequest and handed to processRequest(),
    which subclasses must implement.
    """

    # Content of a double-quoted field.
    # NOTE(review): inside a raw string, \" is only an escaped quote for the
    # regex engine, so the two alternatives together accept any character,
    # quotes included.  Field boundaries are therefore found by backtracking
    # against the rest of the anchored expression.
    _REGEX_STRING = r'(?:[^\"]|\")*'

    # Referrer URL -> referrer="http://..."
    REGEX_REFERRER = r'"(?P<referrer>%s)"' % _REGEX_STRING

    # User agent: "Mozilla ..." -> user_agent="Mozilla ..."
    REGEX_USER_AGENT = r'"(?P<user_agent>%s)"' % _REGEX_STRING

    # HTTP request:
    # "GET /index.html HTTP/1.1" -> method="GET", url="/index.html", http_version="1.1"
    # "GET /" => method="GET", url="/", http_version is not set
    HTTP_METHODS = ("POST", "GET", "HEAD")
    WEBDAV_METHODS = (
        "PUT", "COPY", "MERGE", "DELETE", "CHECKOUT",
        "PROPFIND", "PROPPATCH", "CONNECT",
        "MKACTIVITY", "MKCOL",
        "REPORT", "OPTIONS",
    )
    METHODS = HTTP_METHODS + WEBDAV_METHODS
    REGEX_REQUEST = r'"(?P<method>%s) (?P<url>[^ ]+)(?: HTTP/(?P<http_version>1\.[01]))?"' % '|'.join(METHODS)

    # Date -> date="..."
    REGEX_DATE = r"\[(?P<date>[^]]+)\]"

    # Username: "haypo", "-" -> user="haypo"
    # NOTE(review): only lowercase letters, "-" or "" are accepted.
    REGEX_USER = r'(?P<user>(?:[a-z]+|-|""))'

    # Origin -> origin="..."
    REGEX_ORIGIN = r"(?P<origin>[^ ]+)"

    # Host -> host="localhost"
    REGEX_HOST = r"(?P<host>[^ ]+)"

    # Code, Size: "200 -" --> code="200", size="-"
    REGEX_ANSWER = r"(?P<code>[0-9]{3}) (?P<size>[0-9]+|-)"

    # Number of errors reported as warnings before error() starts raising
    MAX_ERROR = 20

    def __init__(self, syntax):
        """Compile *syntax* into the regular expression used for each line.

        Raises AttributeError if a ``{name}`` placeholder has no matching
        ``REGEX_NAME`` attribute.
        """
        def expand(placeholder):
            # "{date}" -> value of self.REGEX_DATE
            name = placeholder.group(1)
            return getattr(self, "REGEX_" + name.upper())

        pattern = "^%s$" % re.sub("{([^}]+)}", expand, syntax)
        self.date_format = '%d/%b/%Y:%H:%M:%S'
        self.filename = None
        self.line_number = None
        self._regex = re.compile(pattern)
        self.skip_error = True
        self.nb_error = 0
        self.first_datetime = None
        self.last_datetime = None

    def _processRequest(self, request):
        """Update the first/last timestamps seen, then call processRequest()."""
        stamp = request.timestamp
        if stamp:
            if self.first_datetime:
                self.first_datetime = min(self.first_datetime, stamp)
                self.last_datetime = max(self.last_datetime, stamp)
            else:
                # First dated request: it is both the oldest and the newest
                self.first_datetime = stamp
                self.last_datetime = stamp
        self.processRequest(request)

    def processRequest(self, request):
        """Handle one parsed request; must be overridden by subclasses.

        Raises NotImplementedError.  (The previous ``NotImplemented()`` call
        failed with a TypeError because NotImplemented is a constant, not an
        exception class.)
        """
        raise NotImplementedError()

    def error(self, message):
        """Report a parsing problem for the current file and line.

        The problem is logged as a warning and counted.  SyntaxError is
        raised instead when skip_error is false or once MAX_ERROR problems
        have already been counted.
        """
        message = "%s:%s: %s" % (self.filename, self.line_number, message)
        if not self.skip_error or self.MAX_ERROR <= self.nb_error:
            raise SyntaxError(message)
        warning(message)
        self.nb_error += 1

    def parseLine(self, line):
        """Parse one log line and process the resulting request.

        A line that does not match the compiled expression is reported
        through error() and skipped.
        """
        try:
            match = self._regex.search(line)
            if not match:
                self.error("Unable to parse: %s" % line)
                return
            request = ApacheRequest(self, line, match.groupdict())
            self._processRequest(request)
        except KeyboardInterrupt:
            # Show which line was being processed, then let the caller decide
            print("Interrupt line: %s" % line)
            raise

    def parseFile(self, filename):
        """Parse every non-empty line of *filename* ("-" means stdin).

        Progress messages go to stderr.  A file opened here is always closed
        again, even when parsing raises.  On CTRL+C the process exits with
        status 1.
        """
        self.filename = filename
        self.line_number = 0
        try:
            if filename == "-":
                stderr.write("Read stdin...\n")
                stream = stdin
                stream.flush()
                owns_stream = False
            else:
                stderr.write("Load file %s ...\n" % filename)
                stream = open(self.filename, "r")
                owns_stream = True
            try:
                while True:
                    line = stream.readline()
                    if not line:
                        # readline() returns an empty string only at end of file
                        break
                    self.line_number += 1
                    line = line.rstrip()
                    if not line:
                        continue
                    self.parseLine(line)
            finally:
                # Never close stdin, only what was opened above
                if owns_stream:
                    stream.close()
            stderr.write("File %s parsed (%s lines).\n"
                         % (self.filename, self.line_number))
        except KeyboardInterrupt:
            stderr.write("Load interrupted (CTRL+C).\n")
            exit(1)
184
class HitCounter(dict):
    """Dictionary that maps a key to the number of times it was hit."""

    def hit(self, key):
        """Add one to the counter of *key*; an unknown key starts at zero."""
        self[key] = self.get(key, 0) + 1
191
def acceptAll(request):
    """Ignore handler that accepts every request.

    Always returns None, whatever *request* is, meaning "no reason to
    ignore this request".
    """
    return None
194
class ApacheLogParser_Stat(ApacheLogParser):
    """Log parser that counts hits per page, per host and per referrer.

    Attributes:
        stat_page, stat_host, stat_referrer -- HitCounter of accepted hits
        ignore_handler -- callable(request) returning a reason (true value)
                          to ignore the request, or a false value to keep it
        ignore_reasons -- HitCounter of the reasons returned by the handler
        raw_total_hits, raw_hits -- every parsed request, total and per tag
        total_hits, tag_hits -- accepted requests only, total and per tag
    """

    def __init__(self, syntax, host):
        """Create the parser.

        syntax: log format, see ApacheLogParser.
        host: prefix stripped from the beginning of request URLs.
        """
        ApacheLogParser.__init__(self, syntax)
        self.host = host
        # NOTE(review): host is inserted as-is, so regex metacharacters in it
        # (such as ".") are interpreted by the regex engine - confirm whether
        # callers rely on that before escaping it.
        self.regex_host = re.compile("^"+host+"(.*)$")
        self.stat_page = HitCounter()
        self.stat_host = HitCounter()
        self.stat_referrer = HitCounter()
        self.ignore_handler = acceptAll
        self.ignore_reasons = HitCounter()
        self.raw_total_hits = 0
        self.total_hits = 0
        self.tag_hits = HitCounter()
        self.raw_hits = HitCounter()

    def hit(self, attr, key):
        """Increment the counter stored under *key* in the mapping *attr*."""
        if key in attr:
            attr[key] = attr[key] + 1
        else:
            attr[key] = 1

    def ignoreErrorCode(self, code):
        """Tell whether a request with HTTP status *code* must be skipped.

        Raises SyntaxError for a code outside 200..599.

        NOTE(review): both ranges return False, so no request is ever skipped
        here although the caller's comment mentions skipping redirections
        and error pages - confirm the intended behaviour.
        """
        if 200 <= code <= 299:
            return False
        if 300 <= code <= 599:
            return False
        raise SyntaxError("Unknown HTTP code: %s" % code)

    def processRequest(self, request):
        """Update the statistics with one parsed request."""
        # Update raw statistics
        self.raw_total_hits += 1
        self.raw_hits.hit(request.tag)

        # Clean url (remove hostname)
        match = self.regex_host.match(request.url)
        if match:
            # Fixed: used the undefined name "m", which raised NameError
            request.url = "/" + match.group(1)

        # Ignore this hit?
        reason = self.ignore_handler(request)
        if reason:
            self.ignore_reasons.hit(reason)
            return

        # Skip redirection and error pages
        if self.ignoreErrorCode(request.code):
            return

        self.stat_page.hit(request.url)
        self.stat_host.hit(request.host)
        if request.referrer:
            self.stat_referrer.hit(request.referrer)

        # Accept request
        self.total_hits += 1
        self.tag_hits.hit(request.tag)

    def _sort_value(self, val1, val2):
        """Comparison function ordering the biggest value first."""
        if val1 > val2:
            return -1
        elif val1 == val2:
            return 0
        else:
            return 1

    def _sort_stat_referrer(self, key1, key2):
        """Compare two referrers by decreasing hit count."""
        return self._sort_value(self.stat_referrer[key1],  self.stat_referrer[key2])

    def _sort_stat_page(self, key1, key2):
        """Compare two pages by decreasing hit count."""
        return self._sort_value(self.stat_page[key1],  self.stat_page[key2])

    def _sort_stat_host(self, key1, key2):
        """Compare two hosts by decreasing hit count."""
        return self._sort_value(self.stat_host[key1],  self.stat_host[key2])

    def getTopReferrer(self, max=30):
        """Return the *max* most frequent referrers, see getTop()."""
        return self.getTop(self.stat_referrer, self._sort_stat_referrer, max)

    def getTopPage(self, max=30):
        """Return the *max* most requested pages, see getTop()."""
        return self.getTop(self.stat_page, self._sort_stat_page, max)

    def getTopHost(self, max=30):
        """Return the *max* most active hosts, see getTop()."""
        return self.getTop(self.stat_host, self._sort_stat_host, max)

    def getTop(self, attr, sort_func, max=30):
        """Return the first *max* entries of *attr* ordered by *sort_func*.

        attr: mapping key -> count.
        sort_func: comparison function taking two keys (cmp style).
        max: maximum number of entries; zero or less gives an empty list.

        Returns a list of (count, key) tuples.  The previous loop stopped one
        entry early (29 results for max=30) and returned one entry for
        max <= 0.
        """
        keys = list(attr.keys())
        try:
            from functools import cmp_to_key
        except ImportError:
            # Python < 2.7: list.sort() takes the comparison function itself
            keys.sort(sort_func)
        else:
            keys.sort(key=cmp_to_key(sort_func))
        if max <= 0:
            return []
        return [(attr[key], key) for key in keys[:max]]
289
Note: See TracBrowser for help on using the browser.