This file is indexed.

/usr/lib/python2.7/dist-packages/stetl/filters/xmlelementreader.py is in python-stetl 1.1+ds-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
#
# Reads an XML file and returns XML elements.
# Based on inputs.fileinput.XmlElementStreamFileInput.
#
# Author: Frank Steggink
#
from copy import deepcopy

from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('xmlelementreader')


class XmlElementReader(Filter):
    """
    Extracts XML elements from a file, outputs each feature element in Packet.
    Parsing is streaming (no internal DOM buildup) so any file size can be handled.
    Use this class for your big GML files!

    consumes=FORMAT.string, produces=FORMAT.etree_element
    """

    # Start attribute config meta
    @Config(ptype=list, default=None, required=True)
    def element_tags(self):
        """
        Comma-separated string of XML (feature) element tag names of the elements that should be extracted
        and added to the output element stream.
        """
        pass

    @Config(ptype=bool, default=False, required=False)
    def strip_namespaces(self):
        """
        should namespaces be removed from the input document and thus not be present in the output element stream?
        """
        pass

    # End attribute config meta

    # Constructor
    def __init__(self, configdict, section):
        Filter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.etree_element)
        self.context = None
        self.root = None
        self.cur_file_path = None
        self.elem_count = 0
        log.info("Element tags to be matched: %s" % self.element_tags)

    def invoke(self, packet):
        if packet.data is None:
            log.info("No XML file given")
            return packet

        if self.cur_file_path is None:
            self.cur_file_path = packet.data

        event = None
        packet.data = None

        if self.context is None:
            # Open file
            fd = open(self.cur_file_path)
            self.elem_count = 0
            log.info("file opened : %s" % self.cur_file_path)
            self.context = etree.iterparse(fd, events=("start", "end"))
            self.context = iter(self.context)
            event, self.root = self.context.next()

        packet = self.process_xml(packet)

        return packet

    def process_xml(self, packet):
        while self.context is not None:
            # while not packet.is_end_of_doc():
            try:
                event, elem = self.context.next()
            except (etree.XMLSyntaxError, StopIteration):
                # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
                self.context = None

            if self.context is None:
                # Always end of doc
                # TODO: is this still useful for a non-input component?
                packet.set_end_of_doc()
                log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))

                return packet

            # Filter out Namespace from the tag
            # this is the easiest way to go for now
            tag = elem.tag.split('}')
            if len(tag) == 2:
                # Namespaced tag: 2nd is tag
                tag = tag[1]
            else:
                # Non-namespaced tag: first
                tag = tag[0]

            if tag in self.element_tags:
                if event == "start":
                    pass
                elif event == "end":
                    packet.data = deepcopy(elem)
                    self.elem_count += 1

                    if self.strip_namespaces:
                        packet.data = Util.stripNamespaces(elem).getroot()

                    # Clear the root element, since iterparse still builds a tree
                    # See http://effbot.org/zone/element-iterparse.htm
                    self.root.clear()

            # If there is a next component, let it process
            if self.next:
                # Hand-over data (line, doc whatever) to the next component
                packet.format = self._output_format
                packet = self.next.process(packet)

        return packet