# /usr/lib/python3/dist-packages/decopy/parsers.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*- vim60:fdm=marker
#
# Copyright: 2016, Maximiliano Curia <maxy@debian.org>
#
# License: ISC
#  Permission to use, copy, modify, and/or distribute this software for any
#  purpose with or without fee is hereby granted, provided that the above
#  copyright notice and this permission notice appear in all copies.
#  .
#  THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
#  REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
#  AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
#  INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
#  LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
#  OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
#  PERFORMANCE OF THIS SOFTWARE.

'''File Parsers'''

import functools
import logging
import shutil
import string
import subprocess
import tokenize

import xdg.Mime

# Local modules
from .datatypes import CopyrightHolder
from .matchers import find_licenses, parse_copyright, clean_comments, parse_holders


def generic_parser(filename):
    '''Scan any file, line by line, for copyright holders and licenses.

    The file is read as bytes and each line is decoded on its own: UTF-8 is
    tried first; if that fails and the line contains control characters it
    is treated as binary data and reduced to its printable ASCII characters,
    otherwise it is decoded as latin1.

    Args:
        filename: path of the file to parse.

    Returns:
        A (holders, licenses) tuple: the truthy results of
        CopyrightHolder.from_copyright for every copyright reported by
        parse_copyright, and the result of find_licenses applied to the
        whole decoded content after clean_comments.
    '''

    # Decoded lines are collected and joined once at the end instead of
    # growing a string with repeated concatenation.
    decoded_lines = []
    holders = []

    with open(filename, 'rb') as f:
        continuation = None
        for raw_line in f:
            try:
                line = raw_line.decode('utf-8')
            except UnicodeDecodeError:
                if has_control_chars(raw_line):
                    # binary?
                    # Iterating over bytes yields ints, so chr() is needed
                    # to get the character back (str() would produce the
                    # decimal representation of the byte value).
                    line = ''.join(chr(char) for char in raw_line
                                   if char in PRINTABLE_BYTES)
                else:
                    line = raw_line.decode('latin1')
            decoded_lines.append(line)
            line = line.rstrip('\n')
            # continuation carries the parser state from one line to the next
            copyrights, continuation = parse_copyright(line, continuation)
            if not copyrights:
                continue
            for copyright_ in copyrights:
                holder = CopyrightHolder.from_copyright(copyright_)
                if holder:
                    holders.append(holder)

    content = clean_comments(''.join(decoded_lines))

    licenses = find_licenses(content)

    return holders, licenses


@functools.lru_cache()
def cmd_parser_factory(*cmd):
    '''Build a parser that scans the output of an external command.

    The returned parser runs ``cmd`` with the file name appended as last
    argument and looks for copyright holders and licenses in its output
    (stdout and stderr combined).  Results are cached per command by
    functools.lru_cache, so the availability check below runs once for each
    distinct command.

    Args:
        *cmd: the command and its leading arguments, e.g. ('zcat',).

    Returns:
        A callable taking a file name and returning a (holders, licenses)
        tuple.  If the executable cmd[0] cannot be found, generic_parser is
        returned instead.
    '''

    def _parser(filename):
        '''Run the command on filename and parse whatever it prints.'''

        fullcmd = cmd + (filename,)

        try:
            content = subprocess.check_output(fullcmd, universal_newlines=True,
                                              stderr=subprocess.STDOUT)
        except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
            # Best effort: a failing command or undecodable output is
            # treated as an empty result for this file.
            logging.info('failed to parse %s with %s, ignoring (%s)',
                         filename, cmd, e)
            content = ''

        holders = parse_holders(content)
        content = clean_comments(content)
        licenses = find_licenses(content)

        return holders, licenses

    if shutil.which(cmd[0]) is None:
        # logging.warn is a deprecated alias that was removed in
        # Python 3.13; logging.warning is the supported spelling.
        logging.warning(
            'command %s not found, using generic parser as fallback',
            cmd[0])
        return generic_parser

    return _parser


def python_parser(filename):
    '''Extract comments and doc strings from a python file

    Only comments and the strings found at the start of a logical line
    (module, class and function doc strings) are searched, so that string
    literals used by the code itself are ignored.

    Args:
        filename: path of the python source file to parse.

    Returns:
        A (holders, licenses) tuple obtained from the extracted text.  If
        the file cannot be tokenized, the result of generic_parser for the
        same file is returned instead.
    '''

    # Tokens that may appear between the start of a logical line and a doc
    # string without ending the "start of line" state.  ENCODING is always
    # the first token produced by tokenize.tokenize; without it here the
    # state would be cleared before the module doc string is reached.
    neutral_tokens = {tokenize.ENCODING, tokenize.INDENT, tokenize.NL}

    lines = []
    # True while no significant token has been seen on the logical line
    newline = True
    try:
        with open(filename, 'rb') as f:
            for token in tokenize.tokenize(f.readline):
                if (token.type == tokenize.COMMENT) or \
                   (newline and token.type == tokenize.STRING):
                    lines.append(token.string)
                elif token.type == tokenize.NEWLINE:
                    newline = True
                elif newline and (token.type in neutral_tokens):
                    continue
                else:
                    newline = False
    except (tokenize.TokenError, SyntaxError) as e:
        # Not valid source for this interpreter (truncated file, broken
        # indentation, bad encoding declaration, ...): scan it as plain
        # text rather than aborting.
        logging.info('failed to tokenize %s, using generic parser (%s)',
                     filename, e)
        return generic_parser(filename)
    content = '\n'.join(lines)

    holders = parse_holders(content)
    content = clean_comments(content)
    licenses = find_licenses(content)

    return holders, licenses


# Maps a MIME type name to the parser used for files of that type.  Table
# entries are either a ready-made parser callable or the name of an external
# command, which is wrapped with cmd_parser_factory.
KNOWN_PARSERS = {
    mime_type: handler if callable(handler) else cmd_parser_factory(handler)
    for mime_type, handler in (
        ('application/gzip', 'zcat'),
        ('application/x-bzip', 'bzcat'),
        ('application/x-lzma', 'xzcat'),
        ('application/x-xz', 'xzcat'),
        ('text/x-python', python_parser),
        ('image/jpeg', 'exiv2'),
        ('image/png', 'exiv2'),
    )
}


def has_control_chars(bytes_seq):
    '''Tell whether a sequence of byte values contains control characters.

    The ASCII control characters (C0, values below 32) and delete (DEL,
    127) count as control characters, except for tab (HT), line feed (LF)
    and carriage return (CR).

    Args:
        bytes_seq: iterable of integer byte values, e.g. a bytes object.

    Returns:
        True if at least one control character is found, False otherwise.
    '''
    harmless = (9, 10, 13)  # HT, LF, CR
    return any(
        (value < 32 or value == 127) and value not in harmless
        for value in bytes_seq
    )


# Byte values of every character in string.printable; used to pick the
# readable characters out of binary data.
PRINTABLE_BYTES = string.printable.encode('ascii')


def parse_file(fullname, options):
    '''Parse a file with the parser that matches its type.

    Args:
        fullname: path of the file to parse.
        options: object with a ``text`` attribute; when it is true the file
            is handled by generic_parser without looking up its type.

    Returns:
        The (copyrights, licenses) pair produced by the selected parser.
    '''
    # The generic parser is used unless a type specific one is selected
    parser = generic_parser
    if not options.text:
        mime_type = xdg.Mime.get_type(fullname)
        logging.debug('Type for %s is: %s', fullname, mime_type)
        parser = KNOWN_PARSERS.get(str(mime_type), generic_parser)

    copyrights, licenses = parser(fullname)
    logging.debug('Parsed %s: %s, %s', fullname, copyrights, licenses)

    return copyrights, licenses
# decopy 0.2.2-1 / usr / lib / python3 / dist-packages / decopy / parsers.py