This file is indexed.

/usr/bin/tegaki-eval is in python-tegakitools 0.3.1-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2009 The Tegaki project contributors
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# Contributors to this file:
# - Mathieu Blondel

import sys
import os
import time
from optparse import OptionParser

from tegaki.charcol import CharacterCollection
from tegaki.recognizer import Recognizer, RecognizerError

from tegakitools.charcol import *

VERSION = '0.3.1'

def harmonic_mean(x1, x2):
    if x1 == 0.0 and x2 == 0.0:
        return 0.0
    else:
        return 2 * float(x1 * x2) / float(x1 + x2)

class TegakiEvalError(Exception):
    pass

class TegakiEval(object):

    MATCH_RESULTS = (1, 5, 10)

    def __init__(self, options, args):
        self._verbosity_level = options.verbosity_level
        self._directories = options.directories
        self._databases = options.databases
        self._charcols = options.charcols
        self._tomoe = options.tomoe
        self._kuchibue = options.kuchibue
        self._list = options.list
        self._include = options.include
        self._exclude = options.exclude
        self._max_samples = options.max_samples

        if not self._list:
            self._recognizer = args[0]
            self._model = args[1]

    def run(self):
        if self._list:
            self._list_recognizers()
        else:
            self._recognize()

    def _list_recognizers(self):
        avail_recognizers = Recognizer.get_all_available_models()
        print "\n".join(["- %s (%s)" % (model, recog) for recog, model, meta \
                         in avail_recognizers])

    def _recognize(self):
        charcol = get_aggregated_charcol(
                        ((TYPE_CHARCOL, self._charcols),
                         (TYPE_CHARCOL_DB, self._databases),
                         (TYPE_DIRECTORY, self._directories),
                         (TYPE_TOMOE, self._tomoe),
                         (TYPE_KUCHIBUE, self._kuchibue)))


        charcol.include_characters_from_files(self._include)
        charcol.exclude_characters_from_files(self._exclude)

        # max samples
        if self._max_samples:
            charcol.remove_samples(keep_at_most=self._max_samples)

        # FIXME: don't load all characters in memory
        all_chars = charcol.get_all_characters()

        if len(all_chars) == 0:
            raise TegakiEvalError, "No character samples to evaluate!"

        recognizer_class = self._get_recognizer_class()
        recognizer = self._get_recognizer(recognizer_class)

        self._eval(recognizer, all_chars)

    def _get_recognizer_class(self):
        avail_recognizers = Recognizer.get_available_recognizers()

        if not self._recognizer in avail_recognizers:
            err = "Not an available recognizer!\n"
            err += "Available ones include: %s" % \
                ", ".join(avail_recognizers.keys())
            raise TegakiEvalError, err

        return avail_recognizers[self._recognizer]

    def _get_recognizer(self, recognizer_class):
        recognizer = recognizer_class()
        if os.path.exists(self._model):
            # the path exists so we consider the parameter to be a model path
            method = recognizer.open

            # try to find a .meta file
            meta_file = self._model.replace(".model", ".meta")
            if os.path.exists(meta_file) and meta_file.endswith(".meta"):
                try:
                    meta = Recognizer.read_meta_file(meta_file)
                except RecognizerError, e:
                    raise TegakiEvalError, str(e) 
            else:
                meta = {}
        else:
            # otherwise we consider the parameter to be a model name
            avail_models = recognizer_class.get_available_models()
            if not self._model in avail_models:
                err = "Not an available model!\n"
                err += "Available ones include: %s" % \
                    ", ".join(["\"%s\"" % k for k in avail_models.keys()])
                raise TegakiEvalError, err 

            meta = avail_models[self._model]
            method = recognizer.set_model
        try:
            method(self._model)
            recognizer.set_options(meta)
        except RecognizerError, e:
            raise TegakiEvalError, str(e)

        return recognizer

    def _eval(self, recognizer, all_chars):
        # number of samples present per character
        n_samples = {}
        # number of correctly predicted samples per character
        n_corr_pred = {}
        # number of times a character was predicted (correctly or not)
        n_pred = {}

        for n in self.MATCH_RESULTS:
            n_corr_pred[n] = {}
            n_pred[n] = {}

        # calculate our statistics for each character

        canddict = {} # store ALL the candidate results for verbosity >= 2

        start_time = time.time()

        for char in all_chars:
            utf8 = char.get_utf8()
            if not utf8:
                continue

            n_samples[utf8] = n_samples.get(utf8, 0) + 1

            cand = recognizer.recognize(char.get_writing(), 
                                        n=max(self.MATCH_RESULTS))
            cand = [char for char, prob in cand] # we don't need the probability

            if self._verbosity_level >= 2:
                if utf8 not in canddict: canddict[utf8] = []
                canddict[utf8].append(cand)

            for n in self.MATCH_RESULTS:
                if utf8 in cand[0:n]:
                    n_corr_pred[n][utf8] = n_corr_pred[n].get(utf8, 0) + 1
                for c in cand[0:n]:
                    n_pred[n][c] = n_pred[n].get(c, 0) + 1

        end_time = time.time()

        # Calculate accuracy/recall and precision for each character
        # Print the overall results
        print "Overall results"
        print "\tRecognizer: %s" % self._recognizer
        print "\tNumber of characters evaluated: %d\n" % len(all_chars)
        total_time = end_time - start_time
        print "\tTotal time: %0.2f sec" % float(total_time)
        print "\tAverage time per character: %0.2f sec" % \
            (float(total_time) / len(all_chars))
        print "\tRecognition speed: %0.2f char/sec\n" % \
            (len(all_chars) / float(total_time))

        total_samples = sum(n_samples.values())
        recall = {}
        precision = {}
        for n in self.MATCH_RESULTS:
            recall[n] = {}
            precision[n] = {}

        for n in self.MATCH_RESULTS:
            total_corr_pred = sum(n_corr_pred[n].values())
            #total_pred = sum(n_pred[n].values())

            recall_sum = 0
            precision_sum = 0

            for k in n_samples.keys():

                # recall accounts for the recognizer "completeness"
                # i.e. number of correct predictions / number of samples
                recall[n][k] = float(n_corr_pred[n].get(k, 0)) / \
                               float(n_samples[k])
                recall_sum += recall[n][k]

                # i.e. number of correct predictions / number of predictions
                try:
                    precision[n][k] = float(n_corr_pred[n].get(k, 0)) / \
                                       float(n_pred[n][k])
                except KeyError:
                    precision[n][k] = 0

                precision_sum += precision[n][k]
            
            recall_sum *= 100 / float(len(n_samples))
            precision_sum *= 100 / float(len(n_samples))

            print "\tmatch%d" % n
            print "\t\tAccuracy/Recall: %0.2f" % recall_sum
            if n == 1:
                # Precision doesn't make sense for n > 1
                print "\t\tPrecision: %0.2f" % precision_sum
                print "\t\tF1 score: %0.2f" % harmonic_mean(recall_sum, 
                                                            precision_sum)
            print ""

        # verbosity level 1
        if self._verbosity_level < 1:
            return

        print "Result details"
        for k in n_samples.keys():
            print "\tCharacter: %s" %k
            print "\tNumber of samples: %d\n" % n_samples[k]

            for n in self.MATCH_RESULTS:
                print "\t\tmatch%d" % n
                print "\t\tAccuracy/Recall: %0.2f" % (recall[n][k] * 100)
                if n == 1:
                    # Precision doesn't make sense for n > 1
                    print "\t\tPrecision: %0.2f" % (precision[n][k] * 100)
                    f1s = harmonic_mean(recall[n][k], precision[n][k]) * 100
                    print "\t\tF1 score: %0.2f" % f1s
                print ""

            if self._verbosity_level < 2:
                continue
            
            # verbosity level 2
            print "\tCandidates:"
            i = 0
            for cand in canddict[k]:
                print "\tsample%d: %s" % (i, ", ".join(cand))
                i += 1
            print ""

usage = """usage: %prog [options] recognizer model

recognizer        a recognizer available on the system

model             a model name available for that recognizer on the system OR
                  the direct file path to the model
"""
parser = OptionParser(usage=usage, version="%prog " + VERSION)

parser.add_option("-v", "--verbosity-level",
                  type="int", dest="verbosity_level", default=0,
                  help="verbosity level between 0 and 2")


parser.add_option("-d", "--directory",
                  action="append", type="string", dest="directories",
                  default=[],
                  help="directory containing individual XML character files")
parser.add_option("-c", "--charcol",
                  action="append", type="string", dest="charcols",
                  default=[],
                  help="character collection XML files")
parser.add_option("-b", "--db",
                  action="append", type="string", dest="databases",
                  default=[],
                  help="character collection XML files")
parser.add_option("-t", "--tomoe-dict",
                  action="append", type="string", dest="tomoe",
                  default=[],
                  help="Tomoe XML dictionary files")
parser.add_option("-k", "--kuchibue",
                  action="append", type="string", dest="kuchibue",
                  default=[],
                  help="Kuchibue unipen database")


parser.add_option("-l", "--list",
                  action="store_true",dest="list", default=False,
                  help="List available recognizers and models")



parser.add_option("-i", "--include",
                  action="append", type="string", dest="include",
                  default=[],
                  help="File containing characters to include")
parser.add_option("-e", "--exclude",
                  action="append", type="string", dest="exclude",
                  default=[],
                  help="File containing characters to exclude")
parser.add_option("-m", "--max-samples",
                  type="int", dest="max_samples",
                  help="Maximum number of samples per character")



(options, args) = parser.parse_args()

try:
    if not options.list and len(args) < 2:
        raise TegakiEvalError, "Needs a recognizer and a model!"

    TegakiEval(options, args).run()
except TegakiEvalError, e:
    sys.stderr.write(str(e) + "\n\n")
    parser.print_help()
    sys.exit(1)