/usr/lib/python2.7/dist-packages/whoosh/highlight.py is in python-whoosh 2.7.0-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""The highlight module contains classes and functions for displaying short
excerpts from hit documents in the search results you present to the user, with
query terms highlighted.
The highlighting system has four main elements.
* **Fragmenters** chop up the original text into __fragments__, based on the
locations of matched terms in the text.
* **Scorers** assign a score to each fragment, allowing the system to rank the
best fragments by whatever criterion.
* **Order functions** control in what order the top-scoring fragments are
presented to the user. For example, you can show the fragments in the order
they appear in the document (FIRST) or show higher-scoring fragments first
(SCORE).
* **Formatters** turn the fragment objects into human-readable output, such as
an HTML string.
See :doc:`/highlight` for more information.
"""
from __future__ import division
from collections import deque
from heapq import nlargest
from itertools import groupby
from whoosh.compat import htmlescape
from whoosh.analysis import Token
# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15
# Fragment object
def mkfrag(text, tokens, startchar=None, endchar=None,
charsbefore=0, charsafter=0):
"""Returns a :class:`Fragment` object based on the :class:`analysis.Token`
objects in ``tokens``.
"""
if startchar is None:
startchar = tokens[0].startchar if tokens else 0
if endchar is None:
endchar = tokens[-1].endchar if tokens else len(text)
startchar = max(0, startchar - charsbefore)
endchar = min(len(text), endchar + charsafter)
return Fragment(text, tokens, startchar, endchar)
class Fragment(object):
"""Represents a fragment (extract) from a hit document. This object is
mainly used to keep track of the start and end points of the fragment and
the "matched" character ranges inside; it does not contain the text of the
fragment or do much else.
The useful attributes are:
``Fragment.text``
The entire original text from which this fragment is taken.
``Fragment.matches``
An ordered list of objects representing the matched terms in the
fragment. These objects have ``startchar`` and ``endchar`` attributes.
``Fragment.startchar``
The index of the first character in the fragment.
``Fragment.endchar``
The index of the last character in the fragment.
``Fragment.matched_terms``
A ``set`` of the ``text`` of the matched terms in the fragment (if
available).
"""
def __init__(self, text, matches, startchar=0, endchar=-1):
"""
:param text: the source text of the fragment.
:param matches: a list of objects which have ``startchar`` and
``endchar`` attributes, and optionally a ``text`` attribute.
:param startchar: the index into ``text`` at which the fragment starts.
The default is 0.
:param endchar: the index into ``text`` at which the fragment ends.
The default is -1, which is interpreted as the length of ``text``.
"""
self.text = text
self.matches = matches
if endchar == -1:
endchar = len(text)
self.startchar = startchar
self.endchar = endchar
self.matched_terms = set()
for t in matches:
if hasattr(t, "text"):
self.matched_terms.add(t.text)
def __repr__(self):
return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
len(self.matches))
def __len__(self):
return self.endchar - self.startchar
def overlaps(self, fragment):
sc = self.startchar
ec = self.endchar
fsc = fragment.startchar
fec = fragment.endchar
return (sc < fsc < ec) or (sc < fec < ec)
def overlapped_length(self, fragment):
sc = self.startchar
ec = self.endchar
fsc = fragment.startchar
fec = fragment.endchar
return max(ec, fec) - min(sc, fsc)
def __lt__(self, other):
return id(self) < id(other)
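# Illustrative sketch (added commentary): building a Fragment by hand from
# Token objects. Token keyword arguments become attributes, which the
# Highlighter code further below relies on; the sample text and offsets are
# made up, and the expected output is approximate.
#
#     text = u"The quick brown fox jumps over the lazy dog"
#     matches = [Token(text=u"fox", startchar=16, endchar=19, matched=True)]
#     frag = mkfrag(text, matches, charsbefore=10, charsafter=10)
#     UppercaseFormatter().format_fragment(frag)
#     # -> roughly u'ick brown FOX jumps ove' (note the clipped edge words,
#     #    which is what PinpointFragmenter's autotrim option addresses)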
# Tokenizing
def set_matched_filter(tokens, termset):
for t in tokens:
t.matched = t.text in termset
yield t
# Fragmenters
class Fragmenter(object):
def must_retokenize(self):
"""Returns True if this fragmenter requires retokenized text.
If this method returns True, the fragmenter's ``fragment_tokens``
method will be called with an iterator of ALL tokens from the text,
with the tokens for matched terms having the ``matched`` attribute set
to True.
If this method returns False, the fragmenter's ``fragment_matches``
method will be called with a LIST of matching tokens.
"""
return True
def fragment_tokens(self, text, all_tokens):
"""Yields :class:`Fragment` objects based on the tokenized text.
:param text: the string being highlighted.
:param all_tokens: an iterator of :class:`analysis.Token`
objects from the string.
"""
raise NotImplementedError
def fragment_matches(self, text, matched_tokens):
"""Yields :class:`Fragment` objects based on the text and the matched
terms.
:param text: the string being highlighted.
:param matched_tokens: a list of :class:`analysis.Token` objects
representing the term matches in the string.
"""
raise NotImplementedError
class WholeFragmenter(Fragmenter):
"""Doesn't fragment the token stream. This object just returns the entire
stream as one "fragment". This is useful if you want to highlight
the entire text.
Note that even if you use the `WholeFragmenter`, the highlight code will
return no fragment if no terms matched in the given field. To return the
whole fragment even in that case, call `highlights()` with `minscore=0`::
# Query where no terms match in the "text" field
q = query.Term("tag", "new")
r = mysearcher.search(q)
r.fragmenter = highlight.WholeFragmenter()
r.formatter = highlight.UppercaseFormatter()
# Since no terms in the "text" field matched, we get no fragments back
assert r[0].highlights("text") == ""
# If we lower the minimum score to 0, we get a fragment even though it
# has no matching terms
assert r[0].highlights("text", minscore=0) == "This is the text field."
"""
def __init__(self, charlimit=DEFAULT_CHARLIMIT):
self.charlimit = charlimit
def fragment_tokens(self, text, tokens):
charlimit = self.charlimit
matches = []
for t in tokens:
if charlimit and t.endchar > charlimit:
break
if t.matched:
matches.append(t.copy())
return [Fragment(text, matches)]
# Backwards compatibility
NullFragmeter = WholeFragmenter
class SentenceFragmenter(Fragmenter):
"""Breaks the text up on sentence end punctuation characters
(".", "!", or "?"). This object works by looking in the original text for a
sentence end as the next character after each token's 'endchar'.
When highlighting with this fragmenter, you should use an analyzer that
does NOT remove stop words, for example::
sa = StandardAnalyzer(stoplist=None)
"""
def __init__(self, maxchars=200, sentencechars=".!?",
charlimit=DEFAULT_CHARLIMIT):
"""
:param maxchars: The maximum number of characters allowed in a
fragment.
"""
self.maxchars = maxchars
self.sentencechars = frozenset(sentencechars)
self.charlimit = charlimit
def fragment_tokens(self, text, tokens):
maxchars = self.maxchars
sentencechars = self.sentencechars
charlimit = self.charlimit
textlen = len(text)
# startchar of first token in the current sentence
first = None
# Buffer for matched tokens in the current sentence
tks = []
endchar = None
# Number of chars in the current sentence
currentlen = 0
for t in tokens:
startchar = t.startchar
endchar = t.endchar
if charlimit and endchar > charlimit:
break
if first is None:
# Remember the startchar of the first token in a sentence
first = startchar
currentlen = 0
tlength = endchar - startchar
currentlen += tlength
if t.matched:
tks.append(t.copy())
# If the character after the current token is end-of-sentence
# punctuation, finish the sentence and reset
if endchar < textlen and text[endchar] in sentencechars:
# Don't break for two periods in a row (e.g. ignore "...")
if endchar + 1 < textlen and text[endchar + 1] in sentencechars:
continue
# If the sentence had matches and it's not too long, yield it
# as a token
if tks and currentlen <= maxchars:
yield mkfrag(text, tks, startchar=first, endchar=endchar)
# Reset the counts
tks = []
first = None
currentlen = 0
# If we get to the end of the text and there's still a sentence
# in the buffer, yield it
if tks:
yield mkfrag(text, tks, startchar=first, endchar=endchar)
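# Hedged usage sketch (added commentary): per the docstring above, the field
# should be indexed with an analyzer that keeps stop words so sentences stay
# intact. The schema below and the "content" field are hypothetical.
#
#     from whoosh.analysis import StandardAnalyzer
#     from whoosh.fields import Schema, TEXT
#     schema = Schema(content=TEXT(analyzer=StandardAnalyzer(stoplist=None),
#                                  stored=True))
#     # ... index some documents, then search with terms=True ...
#     results.fragmenter = SentenceFragmenter(maxchars=300)
#     results.formatter = UppercaseFormatter()
#     print(results[0].highlights("content"))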
class ContextFragmenter(Fragmenter):
"""Looks for matched terms and aggregates them with their surrounding
context.
"""
def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
"""
:param maxchars: The maximum number of characters allowed in a
fragment.
:param surround: The number of extra characters of context to add both
before the first matched term and after the last matched term.
"""
self.maxchars = maxchars
self.surround = surround
self.charlimit = charlimit
def fragment_tokens(self, text, tokens):
maxchars = self.maxchars
surround = self.surround
charlimit = self.charlimit
# startchar of the first token in the fragment
first = None
# Stack of startchars
firsts = deque()
# Each time we see a matched token, we reset the countdown to finishing
# the fragment. This also indicates whether we're currently inside a
# fragment (< 0 not in fragment, >= 0 in fragment)
countdown = -1
# Tokens in current fragment
tks = []
endchar = None
# Number of chars in the current fragment
currentlen = 0
for t in tokens:
startchar = t.startchar
endchar = t.endchar
tlength = endchar - startchar
if charlimit and endchar > charlimit:
break
if countdown < 0 and not t.matched:
# We're not in a fragment currently, so just maintain the
# "charsbefore" buffer
firsts.append(startchar)
while firsts and endchar - firsts[0] > surround:
firsts.popleft()
elif currentlen + tlength > maxchars:
# We're in a fragment, but adding this token would put us past
# the maximum size. Zero the countdown so the code below will
# cause the fragment to be emitted
countdown = 0
elif t.matched:
# Start/restart the countdown
countdown = surround
# Remember the first char of this fragment
if first is None:
if firsts:
first = firsts[0]
else:
first = startchar
# Add on unused front context
countdown += surround
tks.append(t.copy())
# If we're in a fragment...
if countdown >= 0:
# Update the counts
currentlen += tlength
countdown -= tlength
# If the countdown is expired
if countdown <= 0:
# Finish the fragment
yield mkfrag(text, tks, startchar=first, endchar=endchar)
# Reset the counts
tks = []
firsts = deque()
first = None
currentlen = 0
# If there's a fragment left over at the end, yield it
if tks:
yield mkfrag(text, tks, startchar=first, endchar=endchar)
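# Hedged usage sketch (added commentary): ContextFragmenter is what the
# Highlighter class below falls back to by default, so normally you only set
# it explicitly to tune maxchars/surround; the values here are examples only.
#
#     results.fragmenter = ContextFragmenter(maxchars=300, surround=40)
#     print(results[0].highlights("content", top=2))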
class PinpointFragmenter(Fragmenter):
"""This is a NON-RETOKENIZING fragmenter. It builds fragments from the
positions of the matched terms.
"""
def __init__(self, maxchars=200, surround=20, autotrim=False,
charlimit=DEFAULT_CHARLIMIT):
"""
:param maxchars: The maximum number of characters allowed in a
fragment.
:param surround: The number of extra characters of context to add both
before the first matched term and after the last matched term.
:param autotrim: automatically trims text before the first space and
after the last space in the fragments, to try to avoid truncated
words at the start and end. For short fragments or fragments with
long runs between spaces this may give strange results.
"""
self.maxchars = maxchars
self.surround = surround
self.autotrim = autotrim
self.charlimit = charlimit
def must_retokenize(self):
return False
def fragment_tokens(self, text, tokens):
matched = [t for t in tokens if t.matched]
return self.fragment_matches(text, matched)
@staticmethod
def _autotrim(fragment):
text = fragment.text
startchar = fragment.startchar
endchar = fragment.endchar
firstspace = text.find(" ", startchar, endchar)
if firstspace > 0:
startchar = firstspace + 1
lastspace = text.rfind(" ", startchar, endchar)
if lastspace > 0:
endchar = lastspace
if fragment.matches:
startchar = min(startchar, fragment.matches[0].startchar)
endchar = max(endchar, fragment.matches[-1].endchar)
fragment.startchar = startchar
fragment.endchar = endchar
def fragment_matches(self, text, tokens):
maxchars = self.maxchars
surround = self.surround
autotrim = self.autotrim
charlimit = self.charlimit
j = -1
for i, t in enumerate(tokens):
if j >= i:
continue
j = i
left = t.startchar
right = t.endchar
if charlimit and right > charlimit:
break
currentlen = right - left
while j < len(tokens) - 1 and currentlen < maxchars:
next = tokens[j + 1]
ec = next.endchar
if ec - right <= surround and ec - left <= maxchars:
j += 1
right = ec
currentlen += (ec - next.startchar)
else:
break
left = max(0, left - surround)
right = min(len(text), right + surround)
fragment = Fragment(text, tokens[i:j + 1], left, right)
if autotrim:
self._autotrim(fragment)
yield fragment
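# Hedged usage sketch (added commentary): PinpointFragmenter only avoids
# re-tokenizing when the Highlighter can load character offsets from the
# index (see Highlighter.can_load_chars below), i.e. the search recorded
# matched terms and the field stores character data. The TEXT(chars=True)
# option is recalled from the whoosh.fields API and should be verified.
#
#     schema = Schema(content=TEXT(stored=True, chars=True))
#     results = mysearcher.search(myquery, terms=True)
#     results.fragmenter = PinpointFragmenter(maxchars=200, surround=30,
#                                             autotrim=True)
#     print(results[0].highlights("content"))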
# Fragment scorers
class FragmentScorer(object):
pass
class BasicFragmentScorer(FragmentScorer):
def __call__(self, f):
# Add up the boosts for the matched terms in this passage
score = sum(t.boost for t in f.matches)
# Favor diversity: multiply score by the number of separate
# terms matched
score *= (len(f.matched_terms) * 100) or 1
return score
# Fragment sorters
def SCORE(fragment):
"Sorts higher scored passages first."
return 1
def FIRST(fragment):
"Sorts passages from earlier in the document first."
return fragment.startchar
def LONGER(fragment):
"Sorts longer passages first."
return 0 - len(fragment)
def SHORTER(fragment):
"Sort shorter passages first."
return len(fragment)
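# These order functions are plain sort keys; they are passed as the ``order``
# argument of top_fragments() and Highlighter below. A minimal sketch:
#
#     hi = Highlighter(order=LONGER)   # prefer the longest fragments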
# Formatters
def get_text(original, token, replace):
"""Convenience function for getting the text to use for a match when
formatting.
If ``replace`` is False, returns the part of ``original`` between
``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
``token.text``.
"""
if replace:
return token.text
else:
return original[token.startchar:token.endchar]
class Formatter(object):
"""Base class for formatters.
For highlighters that return strings, it is usually only necessary to
override :meth:`Formatter.format_token`.
Use the :func:`get_text` function as a convenience to get the token text::
class MyFormatter(Formatter):
def format_token(self, text, token, replace=False):
ttext = get_text(text, token, replace)
return "[%s]" % ttext
"""
between = "..."
def _text(self, text):
return text
def format_token(self, text, token, replace=False):
"""Returns a formatted version of the given "token" object, which
should have at least ``startchar`` and ``endchar`` attributes, and
a ``text`` attribute if ``replace`` is True.
:param text: the original fragment text being highlighted.
:param token: an object having ``startchar`` and ``endchar`` attributes
and optionally a ``text`` attribute (if ``replace`` is True).
:param replace: if True, the original text between the token's
``startchar`` and ``endchar`` indices will be replaced with the
value of the token's ``text`` attribute.
"""
raise NotImplementedError
def format_fragment(self, fragment, replace=False):
"""Returns a formatted version of the given text, using the "token"
objects in the given :class:`Fragment`.
:param fragment: a :class:`Fragment` object representing a list of
matches in the text.
:param replace: if True, the original text corresponding to each
match will be replaced with the value of the token object's
``text`` attribute.
"""
output = []
index = fragment.startchar
text = fragment.text
for t in fragment.matches:
if t.startchar is None:
continue
if t.startchar < index:
continue
if t.startchar > index:
output.append(self._text(text[index:t.startchar]))
output.append(self.format_token(text, t, replace))
index = t.endchar
output.append(self._text(text[index:fragment.endchar]))
out_string = "".join(output)
return out_string
def format(self, fragments, replace=False):
"""Returns a formatted version of the given text, using a list of
:class:`Fragment` objects.
"""
formatted = [self.format_fragment(f, replace=replace)
for f in fragments]
return self.between.join(formatted)
def __call__(self, text, fragments):
# For backwards compatibility
return self.format(fragments)
class NullFormatter(Formatter):
"""Formatter that does not modify the string.
"""
def format_token(self, text, token, replace=False):
return get_text(text, token, replace)
class UppercaseFormatter(Formatter):
"""Returns a string in which the matched terms are in UPPERCASE.
"""
def __init__(self, between="..."):
"""
:param between: the text to add between fragments.
"""
self.between = between
def format_token(self, text, token, replace=False):
ttxt = get_text(text, token, replace)
return ttxt.upper()
class HtmlFormatter(Formatter):
"""Returns a string containing HTML formatting around the matched terms.
This formatter wraps matched terms in an HTML element with two class names.
The first class name (set with the constructor argument ``classname``) is
the same for each match. The second class name (set with the constructor
argument ``termclass``) is different depending on which term matched. This
allows you to give different formatting (for example, different background
colors) to the different terms in the excerpt.
>>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
>>> hf(mytext, myfragments)
"The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."
This object maintains a dictionary mapping terms to HTML class names (e.g.
``term0`` and ``term1`` above), so that multiple excerpts will use the same
class for the same term. If you want to re-use the same HtmlFormatter
object with different searches, you should call HtmlFormatter.clean()
between searches to clear the mapping.
"""
template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'
def __init__(self, tagname="strong", between="...",
classname="match", termclass="term", maxclasses=5,
attrquote='"'):
"""
:param tagname: the tag to wrap around matching terms.
:param between: the text to add between fragments.
:param classname: the class name to add to the elements wrapped around
matching terms.
:param termclass: the class name prefix for the second class which is
different for each matched term.
:param maxclasses: the maximum number of term classes to produce. This
limits the number of classes you have to define in CSS by recycling
term class names. For example, if you set maxclasses to 3 and have
5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``,
``term2``, ``term0``, ``term1``.
"""
self.between = between
self.tagname = tagname
self.classname = classname
self.termclass = termclass
self.attrquote = attrquote
self.maxclasses = maxclasses
self.seen = {}
self.htmlclass = " ".join((self.classname, self.termclass))
def _text(self, text):
return htmlescape(text, quote=False)
def format_token(self, text, token, replace=False):
seen = self.seen
ttext = self._text(get_text(text, token, replace))
if ttext in seen:
termnum = seen[ttext]
else:
termnum = len(seen) % self.maxclasses
seen[ttext] = termnum
return self.template % {"tag": self.tagname, "q": self.attrquote,
"cls": self.htmlclass, "t": ttext,
"tn": termnum}
def clean(self):
"""Clears the dictionary mapping terms to HTML classnames.
"""
self.seen = {}
class GenshiFormatter(Formatter):
"""Returns a Genshi event stream containing HTML formatting around the
matched terms.
"""
def __init__(self, qname="strong", between="..."):
"""
:param qname: the QName for the tag to wrap around matched terms.
:param between: the text to add between fragments.
"""
self.qname = qname
self.between = between
from genshi.core import START, END, TEXT # @UnresolvedImport
from genshi.core import Attrs, Stream # @UnresolvedImport
self.START, self.END, self.TEXT = START, END, TEXT
self.Attrs, self.Stream = Attrs, Stream
def _add_text(self, text, output):
if output and output[-1][0] == self.TEXT:
output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
else:
output.append((self.TEXT, text, (None, -1, -1)))
def format_token(self, text, token, replace=False):
qn = self.qname
txt = get_text(text, token, replace)
return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
(self.TEXT, txt, (None, -1, -1)),
(self.END, qn, (None, -1, -1))])
def format_fragment(self, fragment, replace=False):
output = []
index = fragment.startchar
text = fragment.text
for t in fragment.matches:
if t.startchar > index:
self._add_text(text[index:t.startchar], output)
output.append((text, t, replace))
index = t.endchar
if index < len(text):
self._add_text(text[index:], output)
return self.Stream(output)
def format(self, fragments, replace=False):
output = []
first = True
for fragment in fragments:
if not first:
self._add_text(self.between, output)
output += self.format_fragment(fragment, replace=replace)
first = False
return self.Stream(output)
# Highlighting
def top_fragments(fragments, count, scorer, order, minscore=1):
scored_fragments = ((scorer(f), f) for f in fragments)
scored_fragments = nlargest(count, scored_fragments)
best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
best_fragments.sort(key=order)
return best_fragments
def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
scorer=None, minscore=1, order=FIRST, mode="query"):
if scorer is None:
scorer = BasicFragmentScorer()
if type(fragmenter) is type:
fragmenter = fragmenter()
if type(formatter) is type:
formatter = formatter()
if type(scorer) is type:
scorer = scorer()
if scorer is None:
scorer = BasicFragmentScorer()
termset = frozenset(terms)
tokens = analyzer(text, chars=True, mode=mode, removestops=False)
tokens = set_matched_filter(tokens, termset)
fragments = fragmenter.fragment_tokens(text, tokens)
fragments = top_fragments(fragments, top, scorer, order, minscore)
return formatter(text, fragments)
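# Hedged sketch (added commentary) of calling the module-level highlight()
# helper directly, without an index; the sample text and term are made up.
#
#     from whoosh.analysis import StandardAnalyzer
#     txt = u"The template geometry is defined by the template file."
#     print(highlight(txt, [u"template"], StandardAnalyzer(),
#                     WholeFragmenter(), UppercaseFormatter()))
#     # -> u'The TEMPLATE geometry is defined by the TEMPLATE file.'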
class Highlighter(object):
def __init__(self, fragmenter=None, scorer=None, formatter=None,
always_retokenize=False, order=FIRST):
self.fragmenter = fragmenter or ContextFragmenter()
self.scorer = scorer or BasicFragmentScorer()
self.formatter = formatter or HtmlFormatter(tagname="b")
self.order = order
self.always_retokenize = always_retokenize
def can_load_chars(self, results, fieldname):
# Is it possible to build a mapping between the matched terms/docs and
# their start and end chars for "pinpoint" highlighting (i.e. not require
# re-tokenizing text)?
if self.always_retokenize:
# No, we've been configured to always retokenize some text
return False
if not results.has_matched_terms():
# No, we don't know what the matched terms are yet
return False
if self.fragmenter.must_retokenize():
# No, the configured fragmenter doesn't support it
return False
# Maybe, if the field was configured to store characters
field = results.searcher.schema[fieldname]
return field.supports("characters")
@staticmethod
def _load_chars(results, fieldname, texts, to_bytes):
# For each docnum, create a mapping of text -> [(startchar, endchar)]
# for the matched terms
results._char_cache[fieldname] = cache = {}
sorted_ids = sorted(docnum for _, docnum in results.top_n)
for docnum in sorted_ids:
cache[docnum] = {}
for text in texts:
btext = to_bytes(text)
m = results.searcher.postings(fieldname, btext)
docset = set(results.termdocs[(fieldname, btext)])
for docnum in sorted_ids:
if docnum in docset:
m.skip_to(docnum)
assert m.id() == docnum
cache[docnum][text] = m.value_as("characters")
@staticmethod
def _merge_matched_tokens(tokens):
# Merges consecutive matched tokens together, so they are highlighted
# as one
token = None
for t in tokens:
if not t.matched:
if token is not None:
yield token
token = None
yield t
continue
if token is None:
token = t.copy()
elif t.startchar <= token.endchar:
if t.endchar > token.endchar:
token.text += t.text[token.endchar-t.endchar:]
token.endchar = t.endchar
else:
yield token
token = None
if token is not None:
yield token
def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
results = hitobj.results
schema = results.searcher.schema
field = schema[fieldname]
to_bytes = field.to_bytes
from_bytes = field.from_bytes
if text is None:
if fieldname not in hitobj:
raise KeyError("Field %r is not stored." % fieldname)
text = hitobj[fieldname]
# Get the terms searched for/matched in this field
if results.has_matched_terms():
bterms = (term for term in results.matched_terms()
if term[0] == fieldname)
else:
bterms = results.query_terms(expand=True, fieldname=fieldname)
# Convert bytes to unicode
words = frozenset(from_bytes(term[1]) for term in bterms)
# If we can do "pinpoint" highlighting...
if self.can_load_chars(results, fieldname):
# Build the docnum->[(startchar, endchar),] map
if fieldname not in results._char_cache:
self._load_chars(results, fieldname, words, to_bytes)
hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms()
if term[0] == fieldname)
# Grab the word->[(startchar, endchar)] map for this docnum
cmap = results._char_cache[fieldname][hitobj.docnum]
# A list of Token objects for matched words
tokens = []
charlimit = self.fragmenter.charlimit
for word in hitterms:
chars = cmap[word]
for pos, startchar, endchar in chars:
if charlimit and endchar > charlimit:
break
tokens.append(Token(text=word, pos=pos,
startchar=startchar, endchar=endchar))
tokens.sort(key=lambda t: t.startchar)
tokens = [max(group, key=lambda t: t.endchar - t.startchar)
for key, group in groupby(tokens, lambda t: t.startchar)]
fragments = self.fragmenter.fragment_matches(text, tokens)
else:
# Retokenize the text
analyzer = results.searcher.schema[fieldname].analyzer
tokens = analyzer(text, positions=True, chars=True, mode="index",
removestops=False)
# Set Token.matched attribute for tokens that match a query term
tokens = set_matched_filter(tokens, words)
tokens = self._merge_matched_tokens(tokens)
fragments = self.fragmenter.fragment_tokens(text, tokens)
fragments = top_fragments(fragments, top, self.scorer, self.order,
minscore=minscore)
output = self.formatter.format(fragments)
return output
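# Hedged sketch (added commentary): driving a Highlighter object directly
# instead of going through Hit.highlights(); "hit" and the "content" field
# are hypothetical.
#
#     hi = Highlighter(fragmenter=PinpointFragmenter(autotrim=True),
#                      formatter=HtmlFormatter(tagname="b"),
#                      order=SCORE)
#     html = hi.highlight_hit(hit, "content", top=2, minscore=1)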