This file is indexed.

/usr/lib/python2.7/dist-packages/stetl/utils/apachelog.py is in python-stetl 1.1+ds-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
#!/usr/bin/env python
"""Apache Log Parser

Parser for Apache log files. This is a port to python of Peter Hickman's
Apache::LogEntry Perl module:
<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>

Takes the Apache logging format defined in your httpd.conf and generates
a regular expression which is used to match a line from the log file and
return it as a dictionary with keys corresponding to the fields defined
in the log format.

Example:

    import apachelog, sys

    # Format copied and pasted from Apache conf - use raw string + single quotes
    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'

    p = apachelog.parser(format)

    for line in open('/var/apache/access.log'):
        try:
           data = p.parse(line)
        except:
           sys.stderr.write("Unable to parse %s" % line)

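The return dictionary from the parse method depends on the input format
and on the parser options. For the above example, with the
'use_native_types' and 'request_path_only' options switched off, the
returned dictionary would look like;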

    {
    '%>s': '200',
    '%b': '2607',
    '%h': '212.74.15.68',
    '%l': '-',
    '%r': 'GET /images/previous.png HTTP/1.1',
    '%t': '[23/Jan/2004:11:36:20 +0000]',
    '%u': '-',
    '%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
    '%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
    }

...given an access log entry like (split across lines for formatting);

    212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
        200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"

You can also re-map the field names by subclassing (or re-pointing) the
alias method.

Generally you should be able to copy and paste the format string from
your Apache configuration, but remember to place it in a raw string
using single-quotes, so that backslashes are handled correctly.

This module provides three of the most common log formats in the
formats dictionary;

    # Common Log Format (CLF)
    p = apachelog.parser(apachelog.formats['common'])

    # Common Log Format with Virtual Host
    p = apachelog.parser(apachelog.formats['vhcommon'])

    # NCSA extended/combined log format
    p = apachelog.parser(apachelog.formats['extended'])

For notes regarding performance while reading lines from a file
in Python, see <http://effbot.org/zone/readline-performance.htm>.
Further performance boost can be gained by using psyco
<http://psyco.sourceforge.net/>

On my system, using a loop like;

    for line in open('access.log'):
        p.parse(line)

...was able to parse ~60,000 lines / second. Adding psyco to the mix
ups that to ~75,000 lines / second.

The parse_date function is intended as a fast way to convert a log
date into something useful, without incurring a significant date
parsing overhead - good enough for basic stuff but will be a problem
if you need to deal with log from multiple servers in different
timezones.

JvdB:
From https://code.google.com/p/apachelog/
License: Artistic License/GPL
"""

__version__ = "1.1"
__license__ = """Released under the same terms as Perl.
See: http://dev.perl.org/licenses/
"""
__author__ = "Harry Fuecks <hfuecks@gmail.com>"
__contributors__ = [
    "Peter Hickman <peterhi@ntlworld.com>",
    "Loic Dachary <loic@dachary.org>"
]

import re
import hashlib


class ApacheLogParserError(Exception):
    """Raised when a log format cannot be compiled or a line cannot be parsed."""


class parser:
    """
    Parser for Apache access log lines, driven by a LogFormat string.

    The format string is converted once (at construction time) into a
    regular expression; parse() then applies that expression to single
    log lines and returns the captured fields as a dictionary.

    Supported options (see default_options):

    methods            list of HTTP methods to keep; lines whose '%r'
                       request starts with any other token are skipped
                       (parse() returns None for them)
    use_native_types   convert '%>s', '%b' and '%D' to int, '%t' to an
                       int timestamp and a bare '-' to None
    request_path_only  reduce '%r' to its second space-separated token
                       (the request path)
    gen_key            add a 'key' entry holding an md5 hex digest
                       computed over all values of the record
    """
    default_options = {'methods': ['GET', 'HEAD', 'POST'],
                       'use_native_types': True,
                       'request_path_only': True,
                       'gen_key': False}

    def __init__(self, format, key_map=None, options=None):
        """
        Takes the log format from an Apache configuration file.

        Best just copy and paste directly from the .conf file
        and pass using a Python raw string e.g.

        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        p = apachelog.parser(format)

        key_map optionally maps the %-style field names to the names
        used in the returned dictionary; fields missing from the map
        are dropped from the result.

        options optionally overrides entries of default_options; keys
        that are not given keep their default value.
        """
        self._names = []
        self._regex = None
        self._pattern = ''
        self._key_map = key_map

        # Work on a private copy so that instances never share (or mutate)
        # the class-level defaults, and so that a partial options dict
        # cannot cause a KeyError while parsing.
        merged = dict(self.default_options)
        if options:
            merged.update(options)
        self._options = merged

        self._parse_format(format)

    def _parse_format(self, format):
        """
        Converts the input format to a regular
        expression, as well as extracting fields

        Raises an ApacheLogParserError if it couldn't compile
        the generated regex.
        """
        format = format.strip()
        format = re.sub('[ \t]+', ' ', format)

        # In a LogFormat copied from the Apache config, quoted fields are
        # wrapped in backslash-escaped double quotes.
        escaped_quote = '\\"'
        # Header names are matched case-insensitively so that both
        # %{User-Agent}i and %{User-agent}i (as used by the formats shipped
        # in this module) get the pattern that tolerates escaped quotes.
        header_field = re.compile('Referer|User-Agent', re.IGNORECASE)
        # %t and %{...}t time fields are logged inside square brackets.
        time_field = re.compile('^%.*t$')

        self._names = []
        subpatterns = []

        for element in format.split(' '):
            quoted = element.startswith(escaped_quote)
            if quoted:
                element = element[len(escaped_quote):]
                if element.endswith(escaped_quote):
                    element = element[:-len(escaped_quote)]

            self._names.append(self.alias(element))

            if quoted:
                if element == '%r' or header_field.search(element):
                    # Quoted value that may itself contain escaped quotes.
                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
                else:
                    subpattern = r'\"([^\"]*)\"'
            elif time_field.search(element):
                subpattern = r'(\[[^\]]+\])'
            elif element == '%U':
                subpattern = '(.+?)'
            else:
                subpattern = r'(\S*)'

            subpatterns.append(subpattern)

        self._pattern = '^' + ' '.join(subpatterns) + '$'
        try:
            self._regex = re.compile(self._pattern)
        except Exception as e:
            raise ApacheLogParserError(e)

    def parse(self, line):
        """
        Parses a single line from the log file and returns
        a dictionary of its contents.

        Returns None when the request method of the line is not one of
        the configured 'methods' (this includes requests logged as '-').

        Raises an ApacheLogParserError if it couldn't parse the line.
        """
        line = line.strip()
        match = self._regex.match(line)

        if not match:
            raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % (line, self._pattern))

        options = self._options
        data = {}
        for k, raw in zip(self._names, match.groups()):
            v = raw

            # JvdB convert to native Python types if needed
            if options['use_native_types']:
                if k in ('%>s', '%b', '%D'):
                    try:
                        v = int(raw)
                    except (TypeError, ValueError):
                        # e.g. '-' for "no bytes sent"
                        v = 0
                elif k == '%t':
                    try:
                        v = int(parse_date(raw)[0])
                    except (KeyError, ValueError):
                        # unknown month name or non-numeric date parts
                        v = 0
                elif raw == '-':
                    v = None

            # JvdB: elaborate request '%r' string
            if k == '%r':
                # Always split the raw text: with native types enabled an
                # empty request ('-') has already been turned into None.
                request_parts = raw.split(' ')

                # Filter out methods of no interest
                if request_parts[0] not in options['methods']:
                    return None

                if options['request_path_only']:
                    if len(request_parts) > 1:
                        v = request_parts[1]
                    else:
                        v = ''

            # JvdB map %-like keys to readable names using key map
            if self._key_map:
                try:
                    data[self._key_map[k]] = v
                except KeyError:
                    # field not present in the key map: leave it out
                    pass
            else:
                data[k] = v

        # JvdB option to generate unique key, e.g. for database insert
        if options['gen_key']:
            # Generate unique key as md5-string from all values; md5 needs
            # bytes, and list() keeps the text form the same on Python 2
            # and Python 3.
            values_text = str(list(data.values()))
            data['key'] = hashlib.md5(values_text.encode('utf-8')).hexdigest()

        return data

    def alias(self, name):
        """
        Override / replace this method if you want to map format
        field names to something else. This method is called
        when the parser is constructed, not when actually parsing
        a log file

        Takes and returns a string fieldname
        """
        return name

    def pattern(self):
        """
        Returns the compound regular expression the parser extracted
        from the input format (a string)
        """
        return self._pattern

    def names(self):
        """
        Returns the field names the parser extracted from the
        input format (a list)
        """
        return self._names


# Lookup table from the three-letter English month abbreviation used in
# Apache timestamps to its zero-padded two-digit month number.
months = dict(
    (abbrev, '%02d' % number)
    for number, abbrev in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1)
)


def parse_date(date):
    """
    Split an Apache log timestamp into a sortable stamp and its offset.

    Expects the bracketed form written by Apache, for example
    [05/Dec/2006:10:51:44 +0000], and returns a two element tuple:
    the timestamp rearranged to YYYYMMDDHH24IISS (as a string) and
    the timezone offset exactly as it appears in the input;

    parse_date('[05/Dec/2006:10:51:44 +0000]')
    >> ('20061205105144', '+0000')

    The fields are taken from fixed character positions, and no
    timezone adjustment is applied to the timestamp - that is left
    to the caller. An unknown month abbreviation raises KeyError.
    """
    # Drop the surrounding square brackets.
    inner = date[1:-1]

    day = inner[0:2]
    month = inner[3:6]
    year = inner[7:11]
    hour = inner[12:14]
    minute = inner[15:17]
    second = inner[18:20]
    offset = inner[21:]

    stamp = year + months[month] + day + hour + minute + second
    return (stamp, offset)


"""
Frequently used log formats stored here
"""
# Each richer format below is the previous one plus extra fields, so the
# entries are composed from the Common Log Format base string.
formats = {
    # Common Log Format (CLF)
    'common': r'%h %l %u %t \"%r\" %>s %b',
}

# Common Log Format with Virtual Host (CLF prefixed with %v)
formats['vhcommon'] = '%v ' + formats['common']

# NCSA extended/combined log format (CLF plus referer and user agent)
formats['extended'] = formats['common'] + r' \"%{Referer}i\" \"%{User-agent}i\"'

# JvdB: extended with the request duration %D (microseconds) as last field
formats['extended_timed'] = formats['extended'] + ' %D'

if __name__ == '__main__':
    import unittest

    class TestApacheLogParser(unittest.TestCase):
        """Self-test for the parser class and the parse_date helper."""

        def setUp(self):
            self.format = r'%h %l %u %t \"%r\" %>s ' \
                          r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
            self.fields = '%h %l %u %t %r %>s %b %{Referer}i ' \
                          '%{User-Agent}i'.split(' ')
            self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) ' \
                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" ' \
                           '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" ' \
                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
            self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] ' \
                         r'"GET /images/previous.png HTTP/1.1" 200 2607 ' \
                         r'"http://peterhi.dyndns.org/bandwidth/index.html" ' \
                         r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) ' \
                         r'Gecko/20021202"'
            self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] ' \
                         r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 ' \
                         r'"http://peterhi.dyndns.org/bandwidth/index.html" ' \
                         r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) ' \
                         r'Gecko/20021202"'
            self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] ' \
                         r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked' \
                         r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo=' \
                         r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/' \
                         r'bin/search?p=\"grady%20white%20306%20bimini\"" ' \
                         r'"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; ' \
                         r'YPC 3.0.3; yplus 4.0.00d)"'
            # The line tests below assert the raw captured strings, so the
            # type conversion and request-path reduction that are on by
            # default must be switched off. All option keys are given
            # explicitly.
            self.raw_options = {'methods': ['GET', 'HEAD', 'POST'],
                                'use_native_types': False,
                                'request_path_only': False,
                                'gen_key': False}
            self.p = parser(self.format, options=self.raw_options)

        def testpattern(self):
            self.assertEqual(self.pattern, self.p.pattern())

        def testnames(self):
            self.assertEqual(self.fields, self.p.names())

        def testline1(self):
            data = self.p.parse(self.line1)
            self.assertEqual(data['%h'], '212.74.15.68', msg='Line 1 %h')
            self.assertEqual(data['%l'], '-', msg='Line 1 %l')
            self.assertEqual(data['%u'], '-', msg='Line 1 %u')
            self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg='Line 1 %t')
            self.assertEqual(
                data['%r'],
                'GET /images/previous.png HTTP/1.1',
                msg='Line 1 %r'
            )
            self.assertEqual(data['%>s'], '200', msg='Line 1 %>s')
            self.assertEqual(data['%b'], '2607', msg='Line 1 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg='Line 1 %{Referer}i'
            )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg='Line 1 %{User-Agent}i'
            )

        def testline2(self):
            data = self.p.parse(self.line2)
            self.assertEqual(data['%h'], '212.74.15.68', msg='Line 2 %h')
            self.assertEqual(data['%l'], '-', msg='Line 2 %l')
            self.assertEqual(data['%u'], '-', msg='Line 2 %u')
            self.assertEqual(
                data['%t'],
                '[23/Jan/2004:11:36:20 +0000]',
                msg='Line 2 %t'
            )
            self.assertEqual(
                data['%r'],
                r'GET /images/previous.png=\" HTTP/1.1',
                msg='Line 2 %r'
            )
            self.assertEqual(data['%>s'], '200', msg='Line 2 %>s')
            self.assertEqual(data['%b'], '2607', msg='Line 2 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg='Line 2 %{Referer}i'
            )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg='Line 2 %{User-Agent}i'
            )

        def testline3(self):
            data = self.p.parse(self.line3)
            self.assertEqual(data['%h'], '4.224.234.46', msg='Line 3 %h')
            self.assertEqual(data['%l'], '-', msg='Line 3 %l')
            self.assertEqual(data['%u'], '-', msg='Line 3 %u')
            self.assertEqual(
                data['%t'],
                '[20/Jul/2004:13:18:55 -0700]',
                msg='Line 3 %t'
            )
            self.assertEqual(
                data['%r'],
                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='
                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '
                r'HTTP/1.1',
                msg='Line 3 %r'
            )
            self.assertEqual(data['%>s'], '200', msg='Line 3 %>s')
            self.assertEqual(data['%b'], '2888', msg='Line 3 %b')
            self.assertEqual(
                data['%{Referer}i'],
                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'
                r'%20bimini\"',
                msg='Line 3 %{Referer}i'
            )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '
                'yplus 4.0.00d)',
                msg='Line 3 %{User-Agent}i'
            )

        def testnativetypes(self):
            # Default options: numeric fields become ints, '-' becomes
            # None, the timestamp becomes an int and the request is
            # reduced to its path.
            data = parser(self.format).parse(self.line1)
            self.assertEqual(data['%h'], '212.74.15.68', msg='Native %h')
            self.assertEqual(data['%l'], None, msg='Native %l')
            self.assertEqual(data['%u'], None, msg='Native %u')
            self.assertEqual(data['%t'], 20040123113620, msg='Native %t')
            self.assertEqual(data['%r'], '/images/previous.png', msg='Native %r')
            self.assertEqual(data['%>s'], 200, msg='Native %>s')
            self.assertEqual(data['%b'], 2607, msg='Native %b')

        def testjunkline(self):
            self.assertRaises(ApacheLogParserError, self.p.parse, 'foobar')

        def testhasquotesaltn(self):
            p = parser(r'%a \"%b\" %c', options=self.raw_options)
            line = r'foo "xyz" bar'
            data = p.parse(line)
            self.assertEqual(data['%a'], 'foo', '%a')
            self.assertEqual(data['%b'], 'xyz', '%b')
            self.assertEqual(data['%c'], 'bar', '%c')

        def testparsedate(self):
            date = '[05/Dec/2006:10:51:44 +0000]'
            self.assertEqual(('20061205105144', '+0000'), parse_date(date))

    unittest.main()