/usr/share/perl5/Ninka/SentenceExtractor.pm is in ninka 1.3.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | package Ninka::SentenceExtractor;
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions 'catfile';
sub new {
my ($class, %args) = @_;
my $self = bless({}, $class);
die "parameter 'comments' is mandatory" unless exists $args{comments};
my $path = dirname(__FILE__);
$self->{verbose} = ($args{verbose} // 0) == 1;
$self->{comments} = $args{comments};
$self->{abbreviations} = load_abbreviations(catfile($path, 'abbreviations.dict'));
return $self;
}
sub execute {
my ($self) = @_;
my $text = $self->{comments};
my @clean_sentences = ();
# append a newline just in case
$text .= "\n";
# some characters are used to create lines
$text =~ s/\+?\-{3,1000}\+?/ /gmx;
$text =~ s/={3,1000}/ /gmx;
$text =~ s/:{3,1000}/ /gmx;
$text =~ s/\*{3,1000}/ /gmx;
# some characters are used for pretty-printing but never appear in sentences
$text =~ s/\|+/ /gmx;
$text =~ s/\\+/ /gmx;
# deal with comments /*, */ and //
$text =~ s@^[ \t]*/\*@@gmx;
$text =~ s@\*/[ \t]*$@@gmx;
$text =~ s@([^:])// @$1@gmx;
# normalize line separator
$text =~ s/\r\n/\n/g;
# try to replace the leading/ending character of each line #/-,
# at most 3 heading characters and each repeated as many times as necessary
$text =~ s/^[ \t]{0,3}[\*\#\/\;]+//gmx;
$text =~ s/^[ \t]{0,3}[\-]+//gmx;
$text =~ s/[\*\#\/]+[ \t]{0,3}$//gmx;
$text =~ s/[\-]+[ \t]{0,3}$//gmx;
# try to replace the ending character of each line if it is * or #
$text =~ s/[\*\#]+//gmx;
# get rid of lines with nothing but spaces
$text =~ s/^[ \t]+$/\n/gmx;
# let us try the following trick
# we first get rid of \t and replace it with ' '
# we then use \t as a "single line separator" and \n as multiple line
# so we can match each with a single character
$text =~ tr/\t/ /;
$text =~ s/\n(?!\n)/\t/g;
$text =~ s/\n\n+/\n/g;
$text .= "\n";
# this gets us in big trouble... licenses that have numeric abbreviations
$text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g;
while ($text =~ /^([^\n]*)\n/gsm) {
my $curr = $1;
# let us count the number of alphabetic chars to check if we are skipping anything we should not
my $count1 = 0;
for my $i (0..length($curr)-1) {
my $c = substr($curr, $i, 1);
$count1++ if ($c ge 'A' && $c le 'z');
}
my @sentences = $self->split_text($curr);
my $count2 = 0;
foreach my $sentence (@sentences) {
for my $i (0..length($sentence)-1) {
my $c = substr($sentence, $i, 1);
$count2++ if ($c ge 'A' && $c le 'z');
}
my $clean_sentence = clean_sentence($sentence);
push @clean_sentences, $clean_sentence if $clean_sentence;
}
if ($count1 != $count2) {
print STDERR "number of printable chars does not match for [$curr]: [$count1] vs. [$count2]\n";
foreach my $sentence (@sentences) {
my $clean_sentence = clean_sentence($sentence);
print STDERR "cleaned sentence [$clean_sentence]\n";
}
exit 1;
}
}
return \@clean_sentences;
}
sub clean_sentence {
($_) = @_;
# check for trailing bullets of different types
s/^o //;
s/^\s*[0-9]\{1-2\}+\s*[\-\)]//;
s/^[ \t]+//;
s/[ \t]+$//;
# remove a trailing -
s/^[ \t]*[\-\.\s*] +//;
s/\s+/ /g;
s/['"`]+/<quotes>/g;
s/:/<colon>/g;
s/\.+$/./;
die if /\n/m;
return $_;
}
sub split_text {
my ($self, $text) = @_;
my @result;
my $current_sentence = '';
# this breaks the sentence into
# 1. any text before a separator
# 2. the separator
# 3. any text after a separator
while ($text =~ /^
([^\.\!\?\:\n]*)
([\.\!\?\:\n])
(?=(.?))
/xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm) {
$text = $';
my $sentence_match = $1;
my $sentence = $1 . $2;
my $punctuation = $2;
my $after = $3;
# if next character is not a space, then we are not in a sentence"
if ($after ne ' ' && $after ne "\t") {
$current_sentence .= $sentence;
next;
}
# at this point we know that there is a space after
if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
# let us consider this right here a beginning of a sentence
push @result, $current_sentence . $sentence;
$current_sentence = '';
next;
}
if ($punctuation eq '.') {
# we have a bunch of alternatives
# for the time being just consider a new sentence
# TODO
# simple heuristic... let us check that the next words are not the beginning of a sentence
# in our library
# END TODO
# is the last word an abbreviation? for this the period has to follow the word.
# this expression might have to be updated to take care of special characters in names. :(
if ($sentence_match =~ /(.?)([^[:punct:]\s]+)$/) {
my $before = $1;
my $last_word = $2;
# is it an abbreviation
if (length($last_word) == 1) {
# single character abbreviations are special...
# we will assume they never split the sentence if they are capitalized.
if ($last_word ge 'A' && $last_word le 'Z') {
$current_sentence .= $sentence;
next;
}
print STDERR "1 last word an abbrev $sentence_match lastword [$last_word] before [$before]\n" if $self->{verbose};
# but some are lowercase!
if ($last_word eq 'e' || $last_word eq 'i') {
$current_sentence .= $sentence;
next;
}
print STDERR "2 last word an abbrev $sentence_match lastword [$last_word] before [$before]\n" if $self->{verbose};
} else {
$last_word = lc $last_word;
# Only accept abbreviations if the previous char is empty (beginning of line) or a space.
# This avoids things like .c
if (($before eq '' || $before eq ' ') && $self->{abbreviations}{$last_word}) {
$current_sentence .= $sentence;
next;
} else {
# just keep going, we handle this case below
}
}
}
push @result, $current_sentence . $sentence;
$current_sentence = '';
next;
}
die 'We have not dealt with this case';
}
push @result, $current_sentence . $text;
return @result;
}
sub load_abbreviations {
my ($file) = @_;
my %abbreviations = ();
open my $fh, '<', $file or die "can't open file [$file]: $!";
while (my $line = <$fh>) {
chomp $line;
$abbreviations{$line} = $line;
}
close $fh;
return \%abbreviations;
}
1;
__END__
=head1 NAME
Ninka::SentenceExtractor - Break comments into sentences
=head1 DESCRIPTION
Breaks comments into sentences.
=head1 NOTES
This list of abbreviations was extracted from SCOWL (Spell Checker Oriented Word Lists)
by Kevin Atkinson (kevina@gnu.org) version 2015.04.24.
Specifically it was created from scowl-2015.04.24.tar.gz. by running:
cat *abbrev* | sort -u > abbreviations.dict
It also contains some additions by D.M German.
See Copyright.SCOWL for license.
=head1 COPYRIGHT AND LICENSE
Author: Paul Clough
With modifications by Daniel M German and Y. Manabe,
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This patch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
|