/usr/share/perl5/Plucene/Analysis/CharTokenizer.pm is in libplucene-perl 1.25-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
package Plucene::Analysis::CharTokenizer;
=head1 NAME
Plucene::Analysis::CharTokenizer - base class for character tokenisers
=head1 SYNOPSIS
# isa Plucene::Analysis::Tokenizer
my $next = $chartokenizer->next;
=head1 DESCRIPTION
This is an abstract base class for simple, character-oriented tokenizers.
=head1 METHODS
=cut
use strict;
use warnings;
use Carp;
use Plucene::Analysis::Token;
use base 'Plucene::Analysis::Tokenizer';
=head2 token_re

This should be defined in subclasses.

C<next> calls it with no arguments and interpolates the returned value
into the pattern it uses to locate the next token in the buffer. The
version in this base class only dies.

=cut

# And here we deviate from the script: this is a plain method that dies
# instead of an abstract-method declaration, because
# Class::Virtually::Abstract doesn't like being called twice.
sub token_re {
    die "You should define this";
}
=head2 normalize

    my $text = $chartokenizer->normalize($matched);

This will normalise the text before it is added to the token. C<next>
passes in the text matched by C<token_re> and uses the return value as
the token's text.

The default implementation hands its argument back untouched.

=cut

sub normalize {
    my ($self, $text) = @_;
    return $text;
}
=head2 next

    my $next = $chartokenizer->next;

This will return the next token in the string, or undef at the end
of the string.

Input is read one record at a time from C<< $self->{reader} >> and held in
C<< $self->{buffer} >>. Each call removes everything up to and including
the first match of C<token_re> from the front of the buffer and returns
that match as a L<Plucene::Analysis::Token>.

The token's C<text> is whatever C<normalize> returns for the match. Its
C<start> and C<end> are computed from the matched source text rather than
from the normalised text, so they stay aligned with the input even when
C<normalize> returns a string of a different length.

=cut

sub next {
    my $self = shift;
    my $re   = $self->token_re();
    my $fh   = $self->{reader};

    while (1) {

        # Buffer exhausted: pull in the next record, remembering where it
        # began so that token offsets can be derived from that position.
        # NOTE(review): tell() reports a byte position while length()
        # below counts characters; these can disagree on a handle with a
        # decoding layer -- confirm how readers are opened.
        if (!defined $self->{buffer} or !length $self->{buffer}) {
            return if eof($fh);
            $self->{start} = tell($fh);
            my $record = <$fh>;
            return unless defined $record;    # read failed despite !eof
            $self->{buffer} = $record;
        }

        # Anchor at the front of the buffer and let the skipped prefix
        # span newlines. Otherwise a record holding several lines (e.g.
        # a non-default $/) would match after a newline, leaving the
        # prefix in the buffer and the offsets short by its length.
        # (?s:...) is scoped so it cannot change how $re itself matches.
        if ($self->{buffer} =~ s/\A(?s:(.*?))($re)//) {

            # Copy the captures before calling anything that might run
            # another regex and clobber them.
            my ($skipped, $matched) = ($1, $2);

            my $start = $self->{start} + length $skipped;
            my $end   = $start + length $matched;
            $self->{start} = $end;

            # Scalar assignment keeps normalize() from flattening a list
            # into the constructor's key/value arguments.
            my $word = $self->normalize($matched);

            return Plucene::Analysis::Token->new(
                text  => $word,
                start => $start,
                end   => $end,
            );
        }

        # No match, rest of buffer is useless.
        $self->{buffer} = "";

        # But we should try for some more text, so go round again.
    }
}
1;
|