/usr/lib/python2.7/dist-packages/rdflib/compare.py is in python-rdflib 4.1.2-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | # -*- coding: utf-8 -*-
"""
A collection of utilities for canonicalizing and inspecting graphs.
Among other things, they solve of the problem of deterministic bnode
comparisons.
Warning: the time to canonicalize bnodes may increase exponentially on larger
graphs. Use with care!
Example of comparing two graphs::
>>> g1 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel
... <http://example.org/same>,
... [ :label "Same" ],
... <http://example.org/a>,
... [ :label "A" ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel
... <http://example.org/same>,
... [ :label "Same" ],
... <http://example.org/b>,
... [ :label "B" ] .
... ''')
>>>
>>> iso1 = to_isomorphic(g1)
>>> iso2 = to_isomorphic(g2)
These are not isomorphic::
>>> iso1 == iso2
False
Diff the two graphs::
>>> in_both, in_first, in_second = graph_diff(iso1, iso2)
Present in both::
>>> def dump_nt_sorted(g):
... for l in sorted(g.serialize(format='nt').splitlines()):
... if l: print(l.decode('ascii'))
>>> dump_nt_sorted(in_both) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/same> .
<http://example.org>
<http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
_:cbcaabaaba17fecbc304a64f8edee4335e
<http://example.org/ns#label> "Same" .
Only in first::
>>> dump_nt_sorted(in_first) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/a> .
<http://example.org>
<http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
_:cb124e4c6da0579f810c0ffe4eff485bd9
<http://example.org/ns#label> "A" .
Only in second::
>>> dump_nt_sorted(in_second) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/b> .
<http://example.org>
<http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
_:cb558f30e21ddfc05ca53108348338ade8
<http://example.org/ns#label> "B" .
"""
# TODO:
# - Doesn't handle quads.
# - Add warning and/or safety mechanism before working on large graphs?
# - use this in existing Graph.isomorphic?
__all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic',
'to_canonical_graph', 'graph_diff', 'similar']
from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate
from rdflib.term import BNode
try:
import hashlib
md = hashlib.md5
except ImportError:
# for Python << 2.5
import md5
md = md5.new
class IsomorphicGraph(ConjunctiveGraph):
"""
Ported from
<http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py>
(Sean B Palmer's RDF Graph Isomorphism Tester).
"""
def __init__(self, **kwargs):
super(IsomorphicGraph, self).__init__(**kwargs)
def __eq__(self, other):
"""Graph isomorphism testing."""
if not isinstance(other, IsomorphicGraph):
return False
elif len(self) != len(other):
return False
elif list(self) == list(other):
return True # TODO: really generally cheaper?
return self.internal_hash() == other.internal_hash()
def __ne__(self, other):
"""Negative graph isomorphism testing."""
return not self.__eq__(other)
def internal_hash(self):
"""
This is defined instead of __hash__ to avoid a circular recursion
scenario with the Memory store for rdflib which requires a hash lookup
in order to return a generator of triples.
"""
return _TripleCanonicalizer(self).to_hash()
class _TripleCanonicalizer(object):
def __init__(self, graph, hashfunc=hash):
self.graph = graph
self.hashfunc = hashfunc
def to_hash(self):
return self.hashfunc(tuple(sorted(
map(self.hashfunc, self.canonical_triples()))))
def canonical_triples(self):
for triple in self.graph:
yield tuple(self._canonicalize_bnodes(triple))
def _canonicalize_bnodes(self, triple):
for term in triple:
if isinstance(term, BNode):
yield BNode(value="cb%s" % self._canonicalize(term))
else:
yield term
def _canonicalize(self, term, done=False):
return self.hashfunc(tuple(sorted(self._vhashtriples(term, done),
key=_hetero_tuple_key)))
def _vhashtriples(self, term, done):
for triple in self.graph:
if term in triple:
yield tuple(self._vhashtriple(triple, term, done))
def _vhashtriple(self, triple, target_term, done):
for i, term in enumerate(triple):
if not isinstance(term, BNode):
yield term
elif done or (term == target_term):
yield i
else:
yield self._canonicalize(term, done=True)
def _hetero_tuple_key(x):
"Sort like Python 2 - by name of type, then by value. Expects tuples."
return tuple((type(a).__name__, a) for a in x)
def to_isomorphic(graph):
if isinstance(graph, IsomorphicGraph):
return graph
return IsomorphicGraph(store=graph.store)
def isomorphic(graph1, graph2):
"""
Compare graph for equality. Uses an algorithm to compute unique hashes
which takes bnodes into account.
Examples::
>>> g1 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel <http://example.org/a> .
... <http://example.org> :rel <http://example.org/b> .
... <http://example.org> :rel [ :label "A bnode." ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
... @prefix ns: <http://example.org/ns#> .
... <http://example.org> ns:rel [ ns:label "A bnode." ] .
... <http://example.org> ns:rel <http://example.org/b>,
... <http://example.org/a> .
... ''')
>>> isomorphic(g1, g2)
True
>>> g3 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel <http://example.org/a> .
... <http://example.org> :rel <http://example.org/b> .
... <http://example.org> :rel <http://example.org/c> .
... ''')
>>> isomorphic(g1, g3)
False
"""
return _TripleCanonicalizer(graph1).to_hash() == \
_TripleCanonicalizer(graph2).to_hash()
def to_canonical_graph(g1):
"""
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical MD5 checksums, correlated with the graph contents.
"""
graph = Graph()
graph += _TripleCanonicalizer(g1, _md5_hash).canonical_triples()
return ReadOnlyGraphAggregate([graph])
def graph_diff(g1, g2):
"""
Returns three sets of triples: "in both", "in first" and "in second".
"""
# bnodes have deterministic values in canonical graphs:
cg1 = to_canonical_graph(g1)
cg2 = to_canonical_graph(g2)
in_both = cg1 * cg2
in_first = cg1 - cg2
in_second = cg2 - cg1
return (in_both, in_first, in_second)
def _md5_hash(t):
h = md()
for i in t:
if isinstance(i, tuple):
h.update(_md5_hash(i).encode('ascii'))
else:
h.update(unicode(i).encode("utf8"))
return h.hexdigest()
_MOCK_BNODE = BNode()
def similar(g1, g2):
"""
Checks if the two graphs are "similar", by comparing sorted triples where
all bnodes have been replaced by a singular mock bnode (the
``_MOCK_BNODE``).
This is a much cheaper, but less reliable, alternative to the comparison
algorithm in ``isomorphic``.
"""
return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))
def _squashed_graphs_triples(g1, g2):
for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))):
yield t1, t2
def _squash_graph(graph):
return (_squash_bnodes(triple) for triple in graph)
def _squash_bnodes(triple):
return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)
|