/usr/lib/python2.7/dist-packages/tagger_aux.py is in python-libhfst 3.10.0~r2798-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# @file tagger_aux.py
#
# @author Miikka Silfverberg
#
# @brief Auxiliary functions for computing statistics from a POS
# tagger training file.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the Licence.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import math
import collections
import string
import sys
import re
# Regular expression accepted for a single statistics-pattern field:
# one or more space-separated "<WORD|NONE> <TAG|NONE>" pairs,
# e.g. "WORD TAG" or "NONE TAG WORD TAG".
valid_pattern_str = \
    "^(((NONE)|(WORD)) ((NONE)|(TAG)) )*((NONE)|(WORD)) ((NONE)|(TAG))$"
class InvalidPattern(Exception):
    """Raised when a simplifier pattern is malformed (see SequenceSimplifier)."""

    def __init__(self):
        Exception.__init__(self)
class InvalidConfigLine(Exception):
    """Raised when a configuration-file line cannot be parsed."""

    def __init__(self):
        Exception.__init__(self)
class ReachesSequenceEnd(Exception):
    """Raised when a pattern window would run past the end of a sequence."""

    def __init__(self):
        Exception.__init__(self)
class SequenceSimplifier:
    """Masks fields of a window of (word, tag) pairs.

    The pattern is a sequence of [word_flag, tag_flag] pairs (each flag
    0 or 1).  Flag 1 keeps the corresponding field of the input pair;
    flag 0 replaces it with the placeholder "<NONE>".
    """

    def __init__(self, pattern):
        """Store pattern after validating every entry is a 0/1 flag pair.

        Raises InvalidPattern on any malformed entry.
        """
        for flags in pattern:
            well_formed = (len(flags) == 2
                           and flags[0] in (0, 1)
                           and flags[1] in (0, 1))
            if not well_formed:
                raise InvalidPattern()
        self.pattern = pattern

    def simplify(self, i, sequence):
        """Return a tuple of masked (word, tag) tuples for the window of
        sequence starting at index i, one per pattern entry.

        Raises ReachesSequenceEnd when the window does not fit.
        NOTE(review): the original test is ">=", so a window ending
        exactly at the last element also raises — preserved as-is,
        but looks like a possible off-by-one; confirm against callers.
        """
        if len(sequence) <= i + len(self.pattern):
            raise ReachesSequenceEnd()
        masked = []
        for offset, flags in enumerate(self.pattern):
            entry = sequence[i + offset]
            word = entry[0] if flags[0] == 1 else "<NONE>"
            tag = entry[1] if flags[1] == 1 else "<NONE>"
            masked.append((word, tag))
        return tuple(masked)
class Pattern:
    """A named statistics pattern.

    Bundles the numerator and denominator SequenceSimplifiers of one
    conditional-probability statistic together with its name.  The
    model order is the numerator pattern length minus one.
    """

    def __init__(self, numerator, denominator, name):
        self.numerator = numerator
        self.denominator = denominator
        self.name = name
        # n-gram order implied by the numerator window size.
        self.order = len(numerator.pattern) - 1
def read_config_file(config_file_name):
    """Read statistics patterns from file config_file_name.

    Returns a list of Pattern objects, one per non-empty line.
    Raises InvalidConfigLine on malformed lines (via parse_config_line).
    """
    # The original leaked the file handle; "with" guarantees it is closed.
    with open(config_file_name) as config_file:
        config_data = config_file.read().split("\n")
    patterns = []
    for line in config_data:
        if line == "":
            continue
        patterns.append(parse_config_line(line))
    return patterns
def parse_config_pattern(pattern_str):
    """Parse one pattern field such as "WORD TAG NONE TAG".

    Returns a SequenceSimplifier whose pattern has one
    [word_flag, tag_flag] pair per "WORD|NONE TAG|NONE" token pair.
    Raises InvalidConfigLine when the field does not match
    valid_pattern_str.
    """
    if re.match(valid_pattern_str, pattern_str) is None:
        raise InvalidConfigLine()
    field_strings = pattern_str.split(" ")
    pattern = []
    # Consume the tokens pairwise: (word field, tag field).
    for i in range(0, len(field_strings) - 1, 2):
        word_string = field_strings[i]
        tag_string = field_strings[i + 1]
        # int(bool) replaces the obsolete "(lambda x: cond and 1 or 0)" idiom.
        pattern.append([int(word_string == "WORD"),
                        int(tag_string == "TAG")])
    return SequenceSimplifier(pattern)
def parse_config_line(line):
    """Parse one config line of four tab-separated fields:
    name, numerator pattern, denominator pattern, weight.

    Returns a Pattern.  Raises InvalidConfigLine when the line does not
    split into exactly four fields (runs of tabs count as one separator).
    """
    fields = re.split("\t+", line)
    if len(fields) != 4:
        raise InvalidConfigLine()
    name = fields[0]
    numerator_pattern_str = fields[1]
    denominator_pattern_str = fields[2]
    # NOTE(review): the weight field (fields[3]) is required by the format
    # but never used — Pattern stores no weight.  Behavior preserved;
    # the original bound it to an unused local "weigth_str".
    numerator_simplifier = parse_config_pattern(numerator_pattern_str)
    denominator_simplifier = parse_config_pattern(denominator_pattern_str)
    return Pattern(numerator_simplifier, denominator_simplifier, name)
# Unless line is a utf-8 encoded line of number_of_fields tab
# separated fields, raise an exception.
def check_line(line, number_of_fields):
    """Validate one training-file line.

    Raises Exception unless line decodes as utf-8 and contains exactly
    number_of_fields tab-separated fields.
    NOTE: Python 2 idiom — relies on str.decode, which byte strings
    lack on Python 3.
    """
    try:
        line.decode("utf-8")
    except UnicodeDecodeError:
        raise Exception("Invalid utf-8 encoding on line")
    if number_of_fields != len(line.split("\t")):
        raise Exception("Wrong number of fields on line")
def get_penalty(suffix_and_tag_count, suffix_count):
    """Return the tropical (negative-log) penalty of the probability
    suffix_and_tag_count / suffix_count.

    Computed as log(denominator) - log(numerator) so the division never
    happens in linear space.
    """
    numerator_log = math.log(suffix_and_tag_count)
    denominator_log = math.log(suffix_count)
    return denominator_log - numerator_log
def verbose_print(message, is_verbose):
    """Write message plus a trailing newline to stderr, but only when
    is_verbose is true."""
    if not is_verbose:
        return
    sys.stderr.write(message + "\n")
def reverse(str):
    """Return the utf-8 encoded byte string str reversed codepoint by
    codepoint, so multi-byte utf-8 symbols stay intact.

    NOTE(review): the parameter shadows the builtin "str"; name kept to
    preserve the keyword-call interface.
    """
    decoded = str.decode("utf-8")
    return decoded[::-1]
def get_object_counter():
    """Return a map for counting objects, e.g. word forms: a defaultdict
    whose missing entries start at 0.0.

    (The original header comment described the pair counter; the two
    descriptions were swapped.)
    """
    # float() == 0.0, identical to the original "lambda : 0.0" factory.
    return collections.defaultdict(float)
# Return a map for counting objects, e.g. word forms.
def get_pair_counter():
return collections.defaultdict(lambda : get_object_counter())
def get_conditional_penalty_map(pair_counter, object_counter):
    """Return a mapping from pairs in pair_counter to the tropical
    penalty of their conditional probability given the second member.

    pair_counter: nested mapping first -> second -> pair count.
    object_counter: mapping second member -> its total count.
    """
    penalty_map = {}
    # .items() replaces the Python-2-only .iteritems(); on 2.7 it merely
    # builds a list instead of an iterator — the iteration is identical.
    for first_member, second_member_dict in pair_counter.items():
        for second_member, pair_count in second_member_dict.items():
            penalty = get_penalty(pair_count, object_counter[second_member])
            penalty_map[(first_member, second_member)] = penalty
    return penalty_map
def get_penalty_map(object_counter, total_count):
    """Return a mapping from objects to the tropical penalty of their
    probability count / total_count.

    object_counter: mapping object -> count of that object type.
    total_count: total number of objects observed.
    """
    penalty_map = {}
    # .items() for py2/py3 compatibility; loop variable renamed so the
    # builtin "object" is no longer shadowed.
    for obj, count in object_counter.items():
        penalty_map[obj] = get_penalty(count, total_count)
    return penalty_map
def print_conditional_penalties(pair_counter, object_counter,
                                appended_suffix,
                                invert_fields,
                                print_maximum_penalty):
    # Print every conditional penalty derived from pair_counter /
    # object_counter as tab-separated "first<suffix>  second  penalty"
    # lines on stdout (utf-8 encoded).  Python 2 only: uses print
    # statements, iteritems/itervalues and str.decode.
    #
    # appended_suffix: string appended to the first printed field.
    # invert_fields: when true, swap which pair member is printed first.
    # print_maximum_penalty: when true, first print a PENALTY_WEIGHT=
    # line computed from the largest object count plus one.
    penalty_map = get_conditional_penalty_map(pair_counter, object_counter)
    if print_maximum_penalty:
        max_object_count = 0
        for count in object_counter.itervalues():
            max_object_count = max(max_object_count, count)
        # Penalty of 1/(max+1): an upper bound no real pair exceeds.
        print "PENALTY_WEIGHT=" + str(get_penalty(1, max_object_count + 1))
    for pair, penalty in penalty_map.iteritems():
        # String keys are printed directly; anything else is assumed to
        # be a tuple of (word, tag) entries and handled in the else arm.
        if type(pair[0]) == type(u"") or type(pair[0]) == type(""):
            p = list(pair)
            # Py2 quirk: calling .decode on an already-unicode value
            # first encodes it as ascii, which raises UnicodeEncodeError
            # for non-ascii text — so that case is silently skipped and
            # the value kept as-is.
            try:
                p[0] = p[0].decode("utf-8")
            except UnicodeEncodeError:
                pass
            try:
                p[1] = p[1].decode("utf-8")
            except UnicodeEncodeError:
                pass
            if not invert_fields:
                s = string.join([p[0] + appended_suffix,
                                 p[1],
                                 str(penalty)],"\t")
                print s.encode("utf-8")
            else:
                s = string.join([p[1] + appended_suffix,
                                 p[0],
                                 str(penalty)],"\t")
                print s.encode("utf-8")
        else:
            # pair[0] is a simplified subsequence: print each (word, tag)
            # entry tab-separated on one line, then the penalty.
            for entry in pair[0]:
                e = list(entry)
                try:
                    e[0] = e[0].decode("utf-8")
                except UnicodeEncodeError:
                    pass
                try:
                    e[1] = e[1].decode("utf-8")
                except UnicodeEncodeError:
                    pass
                s = e[0] + "\t" + e[1] + "\t"
                # Trailing comma: py2 print without a newline.
                print s.encode("utf-8"),
            print penalty
def print_penalties(object_counter, total_count, appended_suffix):
    # Print one "object<suffix>T<tab>penalty" line per object on stdout
    # (utf-8 encoded), with penalties taken from get_penalty_map.
    # Python 2 only: print statement, iteritems, str.decode.
    #
    # NOTE(review): the literal "T\t" (a capital T before the tab) looks
    # like a typo for plain "\t", but it is part of the emitted format —
    # confirm against consumers before changing it.
    penalty_map = get_penalty_map(object_counter, total_count)
    for object, penalty in penalty_map.iteritems():
        o = object
        # See print_conditional_penalties: decode of an already-unicode
        # value raises UnicodeEncodeError, which leaves o unchanged.
        try:
            o = o.decode("utf-8")
        except UnicodeEncodeError:
            pass
        s = o + appended_suffix + "T\t" + str(penalty)
        print s.encode("utf-8")