#!/usr/bin/env python
"""Convert an Apache "combined" format access log (read from stdin) into the
JSON record set consumed by MIT Simile Exhibit.

Each log line becomes one item dict; lines whose user-agent matches a
well-known robot (see BOT_TRACES) are skipped.  The result is printed to
stdout as {"items": [...]}.

Usage: python thisscript.py < access.log > items.json
"""
import re
import sys
import json
import time
import datetime
import itertools
from http.client import responses as HTTP_RESPONSES

# This regular expression is the heart of the code.
# It matches a single line in Apache "combined log format".
# The r'' string form is just a convenience so you don't have to escape backslashes.
COMBINED_LOGLINE_PAT = re.compile(
    r'(?P<origin>\d+\.\d+\.\d+\.\d+) '
    r'(?P<identd>-|\w*) (?P<auth>-|\w*) '
    # ts is the full "day/mon/year:HH:MM:SS" stamp (what parse_apache_date
    # consumes); tz is the numeric UTC offset, e.g. "-0500"
    r'\[(?P<ts>(?P<date>[^\[\]:]+):(?P<time>\d{2}:\d{2}:\d{2})) (?P<tz>[\-\+]?\d{4})\] '
    r'"(?P<method>\w+) (?P<path>[\S]+) (?P<protocol>[^"]+)" (?P<status>\d+) (?P<bytes>-|\d+)'
    r'( (?P<referrer>"[^"]*")( (?P<client>"[^"]*")( (?P<cookie>"[^"]*"))?)?)?\s*\Z'
)

# Patterns in the client (user-agent) field for sniffing out robots
BOT_TRACES = [
    (re.compile(r".*http://help\.yahoo\.com/help/us/ysearch/slurp.*"),
     "Yahoo robot"),
    (re.compile(r".*\+http://www\.google\.com/bot\.html.*"),
     "Google robot"),
    (re.compile(r".*\+http://about\.ask\.com/en/docs/about/webmasters.shtml.*"),
     "Ask Jeeves/Teoma robot"),
    (re.compile(r".*\+http://search\.msn\.com\/msnbot\.htm.*"),
     "MSN robot"),
    (re.compile(r".*http://www\.entireweb\.com/about/search_tech/speedy_spider/.*"),
     "Speedy Spider"),
    (re.compile(r".*\+http://www\.baidu\.com/search/spider_jp\.html.*"),
     "Baidu spider"),
    (re.compile(r".*\+http://www\.gigablast\.com/spider\.html.*"),
     "Gigabot robot"),
]

# Safety cap on the number of log lines processed per run
MAXRECORDS = 1000


# Apache's date/time format is very messy, so dealing with it is messy
# This class provides support for managing timezones in the Apache time field
# Reuses some code from: http://seehuhn.de/blog/52
class timezone(datetime.tzinfo):
    """Fixed-offset tzinfo built from an Apache numeric offset like "-0500"."""

    def __init__(self, name="+0000"):
        self.name = name
        # Parse sign, hours and minutes separately.  The naive formula
        # int(name[:-2])*3600 + int(name[-2:])*60 gets half-hour offsets such
        # as "-0530" wrong because the minutes don't share the hours' sign.
        sign = -1 if name.startswith('-') else 1
        hours = int(name[-4:-2])
        minutes = int(name[-2:])
        self.offset = sign * datetime.timedelta(hours=hours, minutes=minutes)

    def utcoffset(self, dt):
        return self.offset

    def dst(self, dt):
        # Fixed offset: no daylight-saving adjustment.  (Original returned the
        # unqualified name `timedelta(0)`, which raised NameError.)
        return datetime.timedelta(0)

    def tzname(self, dt):
        return self.name


def parse_apache_date(date_str, tz_str):
    '''
    Parse the timestamp from the Apache log file, and return a
    timezone-aware datetime object.

    date_str -- e.g. "27/Apr/2009:08:21:42" (the regex "ts" group)
    tz_str   -- e.g. "-0500" (the regex "tz" group)
    '''
    tt = time.strptime(date_str, "%d/%b/%Y:%H:%M:%S")
    # Keep year..second, zero the microseconds, attach the parsed offset
    tt = tt[:6] + (0, timezone(tz_str))
    return datetime.datetime(*tt)


def bot_check(match_info):
    '''
    Return True if the matched line's client field looks like a robot.
    '''
    # The client group is optional in the regex; treat a missing one
    # as "not a robot" instead of crashing on None.
    client = match_info.group('client') or ''
    return any(pat.match(client) for pat, _botname in BOT_TRACES)


def parse_log_lines(stream, maxrecords=MAXRECORDS):
    '''
    Parse up to maxrecords lines from the given iterable of log lines into a
    list of Exhibit item dicts.  Unparseable lines are reported to stderr and
    skipped; robot traffic (see BOT_TRACES) is silently dropped.
    '''
    entries = []
    # enumerate maintains the line count; itertools.islice caps the work done
    for count, line in enumerate(itertools.islice(stream, 0, maxrecords)):
        match_info = COMBINED_LOGLINE_PAT.match(line)
        if not match_info:
            sys.stderr.write("Unable to parse log line\n")
            continue
        # If you want to include robot clients, comment out the next two lines
        if bot_check(match_info):
            continue
        timestamp = parse_apache_date(match_info.group('ts'),
                                      match_info.group('tz'))
        timestamp_str = timestamp.isoformat()
        entry = {}
        # To make Exhibit happy, set id and label fields that give some
        # information about the entry, but are unique across all entries
        # (uniqueness ensured by appending count)
        entry['id'] = (match_info.group('origin') + ':' + timestamp_str
                       + ':' + str(count))
        entry['label'] = entry['id']
        entry['origin'] = match_info.group('origin')
        # identd/auth appear in the sample output this script documents,
        # so carry them through as well
        entry['identd'] = match_info.group('identd')
        entry['auth'] = match_info.group('auth')
        entry['timestamp'] = timestamp_str
        entry['path'] = match_info.group('path')
        entry['method'] = match_info.group('method')
        entry['protocol'] = match_info.group('protocol')
        status = match_info.group('status')
        # Append the human-readable reason phrase, e.g. "200 OK"; fall back
        # gracefully on status codes the stdlib table doesn't know
        entry['status'] = status + ' ' + HTTP_RESPONSES.get(int(status), 'Unknown')
        if match_info.group('bytes') != '-':
            entry['bytes'] = match_info.group('bytes')
        # '"-"' is Apache's "no referrer" marker (the group keeps its quotes)
        if match_info.group('referrer') != '"-"':
            entry['referrer'] = match_info.group('referrer')
        entry['client'] = match_info.group('client')
        entries.append(entry)
    return entries


def main():
    '''Read Apache log lines from stdin, print the Exhibit JSON to stdout.'''
    print(json.dumps({'items': parse_log_lines(sys.stdin)}, indent=4))


if __name__ == '__main__':
    main()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | { "items": [ { "origin": "208.111.154.16", "status": "200 OK", "protocol": "HTTP/1.1", "timestamp": "2009-04-27T08:21:42-05:00", "bytes": "2638", "auth": "-", "label": "208.111.154.16:2009-04-27T08:21:42-05:00:2", "identd": "-", "method": "GET", "client": "Mozilla/5.0 (compatible; Charlotte/1.1; http://www.searchme.com/support/)", "referrer": "-", "path": "/uche.ogbuji.net", "id": "208.111.154.16:2009-04-27T08:21:42-05:00:2" }, { "origin": "65.103.181.249", "status": "200 OK", "protocol": "HTTP/1.1", "timestamp": "2009-04-27T09:11:54-05:00", "bytes": "6767", "auth": "-", "label": "65.103.181.249:2009-04-27T09:11:54-05:00:4", "identd": "-", "method": "GET", "client": "Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php?+)", "referrer": "-", "path": "/", "id": "65.103.181.249:2009-04-27T09:11:54-05:00:4" } ] } |
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |