This file is indexed.

/usr/lib/python2.7/dist-packages/rdflib/TextIndex.py is in python-rdflib 2.4.2-3build1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
try:
    from hashlib import md5
except ImportError:
    from md5 import md5
    
from rdflib.BNode import BNode
from rdflib.Graph import ConjunctiveGraph
from rdflib.Literal import Literal
from rdflib.Namespace import NamespaceDict as Namespace
from rdflib.URIRef import URIRef
from rdflib.store import TripleAddedEvent, TripleRemovedEvent
from rdflib.store.IOMemory import IOMemory
import logging
import re #, stopdict

_logger = logging.getLogger(__name__)

def get_stopdict():
    """Return the shared stopword dictionary.

    The keys are the lowercase English stopwords listed in ``_words`` and
    every value is ``None``.  The same dictionary object is returned on
    each call.
    """
    return _dict


# Very common English words that are dropped before indexing or searching.
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
]

# Dictionary used as a set: only the keys matter, all values are None.
_dict = dict.fromkeys(_words)

# Matches runs of word characters; (?u) turns on Unicode-aware matching.
word_pattern = re.compile(r"(?u)\w+")

# Membership test for stopwords.  ``dict.has_key`` does not exist on
# Python 3, whereas ``__contains__`` is available on every version.
has_stop = get_stopdict().__contains__


def splitter(s):
    """Return the list of word tokens found in the string *s*."""
    return word_pattern.findall(s)


def stopper(s):
    """Lowercase the words in *s* and drop the stopwords.

    *s* is an iterable of word strings (typically the result of
    ``splitter``).  The stopword test is applied to the lowercased word,
    so capitalised stopwords such as ``"The"`` are removed as well.

    Returns a new list of lowercase words in their original order.
    """
    lowered = [w.lower() for w in s]
    return [w for w in lowered if not has_stop(w)]



class TextIndex(ConjunctiveGraph):
    """
    An rdflib graph event handler that indexes text literals that are
    added to another graph.

    This class lets you 'search' the text literals in an RDF graph.
    Typically in RDF to search for a substring in an RDF graph you
    would have to 'brute force' search every literal string looking
    for your substring.

    Instead, this index stores the words in literals into another
    graph whose structure makes searching for terms much less
    expensive.  It does this by chopping up the literals into words,
    removing very common words (currently only in English) and then
    adding each of those words into an RDF graph that describes the
    statements in the original graph that the word came from.

    First, let's create a graph that will transmit events and a text
    index that will receive those events, and then subscribe the text
    index to the event graph:

      >>> e = ConjunctiveGraph()
      >>> t = TextIndex()
      >>> t.subscribe_to(e)

    When triples are added to the event graph (e) events will be fired
    that trigger event handlers in subscribers.  In this case our only
    subscriber is a text index and its action is to index triples that
    contain literal RDF objects.  Here are 3 such triples:

      >>> e.add((URIRef('a'), URIRef('title'), Literal('one two three')))
      >>> e.add((URIRef('b'), URIRef('title'), Literal('two three four')))
      >>> e.add((URIRef('c'), URIRef('title'), Literal('three four five')))

    Of the three literal objects that were added, they all contain
    five unique terms.  These terms can be queried directly from the
    text index:

      >>> t.term_strings() ==  set(['four', 'five', 'three', 'two', 'one'])
      True

    Now we can search for statements that contain certain terms.  Let's
    search for 'one' which occurs in only one of the literals
    provided, 'a'.  This can be queried for:

      >>> t.search('one')
      set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None)])

    'one' and 'five' only occur in one statement each, 'two' and
    'four' occur in two, and 'three' occurs in three statements:

      >>> len(list(t.search('one')))
      1
      >>> len(list(t.search('two')))
      2
      >>> len(list(t.search('three')))
      3
      >>> len(list(t.search('four')))
      2
      >>> len(list(t.search('five')))
      1

    Let's add some more statements with different predicates.

      >>> e.add((URIRef('a'), URIRef('creator'), Literal('michel')))
      >>> e.add((URIRef('b'), URIRef('creator'), Literal('Atilla the one Hun')))
      >>> e.add((URIRef('c'), URIRef('creator'), Literal('michel')))
      >>> e.add((URIRef('d'), URIRef('creator'), Literal('Hun Mung two')))

    Now 'one' occurs in two statements:

      >>> assert len(list(t.search('one'))) == 2

    And 'two' occurs in three statements, here they are:

      >>> t.search('two')
      set([(rdflib.URIRef('d'), rdflib.URIRef('creator'), None), (rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The predicates that are searched can be restricted by providing an
    argument to 'search()':

      >>> t.search('two', URIRef('creator'))
      set([(rdflib.URIRef('d'), rdflib.URIRef('creator'), None)])

      >>> t.search('two', URIRef(u'title'))
      set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    You can search for more than one term by simply including it in
    the query:

      >>> t.search('two three', URIRef(u'title'))
      set([(rdflib.URIRef('c'), rdflib.URIRef('title'), None), (rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The above query returns all the statements that contain 'two' OR
    'three'.  For the documents that contain 'two' AND 'three', do an
    intersection of two queries:

      >>> t.search('two', URIRef(u'title')).intersection(t.search(u'three', URIRef(u'title')))
      set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    Intersecting two queries like this is probably not the most
    efficient way to do it, but for reasonable data sets this isn't a
    problem.  Larger data sets will want to query the graph with
    sparql or something else more efficient.

    In all the above queries, the object of each statement was always
    'None'.  This is because the index graph does not store the object
    data, that would make it very large, and besides the data is
    available in the original data graph.  For convenience, a method
    is provided to 'link' an index graph to a data graph.  This allows
    the index to also provide object data in query results.

      >>> t.link_to(e)
      >>> set([str(i[2]) for i in t.search('two', URIRef(u'title')).intersection(t.search(u'three', URIRef(u'title')))]) ==  set(['two three four', 'one two three'])
      True

    You can remove the link by assigning None:

      >>> t.link_to(None)

    Unindexing means to remove statements from the index graph that
    correspond to a statement in the data graph.  Note that while it is
    possible to remove the index information of the occurrences of
    terms in statements, it is not possible to remove the terms
    themselves, terms are 'absolute' and are never removed from the
    index graph.  This is not a problem since languages have finite
    terms:

      >>> e.remove((URIRef('a'), URIRef('creator'), Literal('michel')))
      >>> e.remove((URIRef('b'), URIRef('creator'), Literal('Atilla the one Hun')))
      >>> e.remove((URIRef('c'), URIRef('creator'), Literal('michel')))
      >>> e.remove((URIRef('d'), URIRef('creator'), Literal('Hun Mung two')))

    Now 'one' only occurs in one statement:

      >>> assert len(list(t.search('one'))) == 1

    And 'two' only occurs in two statements, here they are:

      >>> t.search('two')
      set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The predicates that are searched can be restricted by providing an
    argument to 'search()':

      >>> t.search('two', URIRef(u'creator'))
      set([])

      >>> t.search('two', URIRef(u'title'))
      set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    """

    # Data graph used by search() to fill in statement objects; None
    # means results carry None as their object.  Set through link_to().
    linked_data = None

    # Namespace under which the index mints its own identifiers.
    text_index = Namespace('http://rdflib.net/text_index#')
    # Predicate linking a term node to the word literal it stands for.
    term = Namespace('http://rdflib.net/text_index#')["term"]
    # Predicate linking a term node to a subject whose literal contains it.
    termin = Namespace('http://rdflib.net/text_index#')["termin"]

    def __init__(self, store='default'):
        """Create the index graph on top of *store*."""
        super(TextIndex, self).__init__(store)

    def add_handler(self, event):
        """Index ``event.triple`` when its object is exactly a Literal."""
        if type(event.triple[2]) is Literal:
            self.index(event.triple)

    def remove_handler(self, event):
        """Unindex ``event.triple`` when its object is exactly a Literal."""
        if type(event.triple[2]) is Literal:
            self.unindex(event.triple)

    def index(self, triple):
        """Add the words of a literal-valued statement to the index.

        *triple* is an ``(s, p, o)`` tuple.  Nothing happens unless ``o``
        is exactly a ``Literal``.  ``unindex`` is the reverse of this
        method.

        For every non-stopword ``word`` in the literal the index graph
        ends up containing:

        * ``(t, self.term, word)``  -- term node ``t`` stands for ``word``
          (added only when the word is not in the index yet),
        * ``(t, self.termin, s)``   -- the term occurs in subject ``s``,
        * ``(p, t, s)``             -- the term occurs under predicate ``p``.
        """
        # Unpacked here rather than in the signature: tuple parameters
        # are a syntax error on Python 3.
        s, p, o = triple

        # first, only index statements that have a literal object
        if type(o) is not Literal:
            return

        # split the literal and remove any stopwords
        for word in stopper(splitter(o)):
            # create a new literal for each word in the object
            word = Literal(word)

            if self.value(predicate=self.term, object=word, any=True):
                # The word is already known: attach this statement to every
                # existing term node.  The query is materialised into a set
                # first because the loop body adds triples to this graph.
                for t in set(self.triples((None, self.term, word))):
                    t = t[0]
                    # record the subject the term occurs in, if missing
                    if (t, self.termin, s) not in self:
                        self.add((t, self.termin, s))

                    # ditto for the predicate
                    if (p, t, s) not in self:
                        self.add((p, t, s))
            else:
                # The term does not exist in the graph yet: add it, and the
                # references to the statement.  t gets used as a predicate,
                # so its identifier is built from a hash of the word,
                # subject and predicate (AKA it can't be a BNode).
                h = md5(word.encode('utf-8'))
                h.update(s.encode('utf-8'))
                h.update(p.encode('utf-8'))
                t = self.text_index["term_%s" % h.hexdigest()]
                self.add((t, self.term, word))
                self.add((t, self.termin, s))
                self.add((p, t, s))

    def unindex(self, triple):
        """Remove the index entries that ``index`` created for *triple*.

        *triple* is an ``(s, p, o)`` tuple.  Nothing happens unless ``o``
        is exactly a ``Literal``.  Only the ``(t, self.termin, s)`` and
        ``(p, t, s)`` references are removed; the ``(t, self.term, word)``
        triples themselves are left in the index.
        """
        s, p, o = triple

        if type(o) is not Literal:
            return

        for word in stopper(splitter(o)):
            word = Literal(word)
            if self.value(predicate=self.term, object=word, any=True):
                # Materialise the matches before removing anything, as
                # index() does, so the graph is never modified while one
                # of its own queries is still being iterated.
                for t in set(self.triples((None, self.term, word))):
                    t = t[0]
                    if (t, self.termin, s) in self:
                        self.remove((t, self.termin, s))
                    if (p, t, s) in self:
                        self.remove((p, t, s))

    def terms(self):
        """ Return the set of all term literals in the index graph. """
        return set(self.objects(None, self.term))

    def term_strings(self):
        """ Return the set of all terms as plain strings. """
        return set([str(i) for i in self.terms()])

    def search(self, terms, predicate=None):
        """ Return the set of all statements any of the terms occur in.

        *terms* is a string; it is split into words and stop-filtered the
        same way literals are when they are indexed, and statements
        matching ANY of the remaining words are returned.

        *predicate*, when given, restricts the results to statements with
        that predicate.  A value that is not a ``URIRef`` is converted to
        one after logging a warning.

        Each result is a ``(subject, predicate, object)`` tuple.  The
        object is ``None`` unless a data graph was attached with
        ``link_to``, in which case it is looked up with
        ``self.linked_data.value(subject, predicate)``.
        """
        if predicate and not isinstance(predicate, URIRef):
            _logger.warning("predicate is not a URIRef")
            predicate = URIRef(predicate)
        results = set()
        terms = [Literal(term) for term in stopper(splitter(terms))]

        for term in terms:
            # every term node standing for this word
            for t in self.triples((None, self.term, term)):
                # every subject the term occurs in
                for o in self.objects(t[0], self.termin):
                    # every (predicate, term node, subject) reference
                    for p in self.triples((predicate, t[0], o)):
                        if self.linked_data is None:
                            results.add((o, p[0], None))
                        else:
                            results.add((o, p[0], self.linked_data.value(o, p[0])))
        return results

    def index_graph(self, graph):
        """
        Index every triple of *graph*.  Must be a conjunctive graph.
        """
        for t in graph.triples((None, None, None)):
            self.index(t)

    def link_to(self, graph):
        """
        Link to a data graph so that search() can return statement
        objects.  Pass None to remove the link.
        """
        self.linked_data = graph

    def subscribe_to(self, graph):
        """
        Subscribe this index to a graph: register add_handler and
        remove_handler for the TripleAddedEvent and TripleRemovedEvent
        events of the graph's store dispatcher.
        """
        graph.store.dispatcher.subscribe(TripleAddedEvent, self.add_handler)
        graph.store.dispatcher.subscribe(TripleRemovedEvent, self.remove_handler)


def test():
    """Run ``doctest.testmod()`` with its default arguments."""
    from doctest import testmod
    testmod()


# Execute the doctests when this file is run as a script.
if __name__ == '__main__':
    test()