This file is indexed.

/usr/bin/tegaki-bootstrap is in python-tegakitools 0.3.1-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2009 The Tegaki project contributors
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# Contributors to this file:
# - Christoph Burgmer, cburgmer ÂT ira DÔT uka DÔT de (main author)
# - Mathieu Blondel

"""
Tries to bootstrap a character collection using similar collections and
knowledge about the characters shape data.

Basically all characters whose components have data, can be interpolated by
scaling and shifting the components data and merging them together.

Example:
黴 is made up from the component structure ⿲彳⿳山一黑攵 or graphically
 山
彳一攵
 黑
If we have handwriting data of the five components, we can easily integrate that
into the character in question.

To bootstrap a Traditional Chinese collection run:
$ tegaki-bootstrap --domain=BIG5 --locale=T --max-samples=1 \
    handwriting-zh_TW.xml -d ~/xml/ \
    -t tegaki-zinnia-simplified-chinese/handwriting-zh_CN.xml \
    -t tegaki-zinnia-japanese/handwriting-ja.xml

TODO:
    - Use bounding box when merging components. For example above 彳 probably
      has a bounding box of width=0.5, which only needs to be scaled by 3/5,
      and 一 has a bounding box of height=0.2, which allows scaling the upper
      and lower components less aggressively.
    - Map collection sources to a character locale, so we can exclude data that
      does not fit to the target locale. Furthermore instead of doing exact
      transformations for source characters from the wrong locale, try using
      component data.
    - Try similar characters if no exact character can be found?
    - Offer easy way to provide additional variations, e.g. stroke order
"""
import sys
import locale
import random
random.seed(12345) # provide deterministic results
from optparse import OptionParser

from tegaki.character import CharacterCollection, Writing, Character

from tegakitools.charcol import *

try:
    from cjklib.characterlookup import CharacterLookup
    from cjklib.exception import UnsupportedError
except ImportError:
    print "Please install cjklib (http://code.google.com/p/cjklib)"
    sys.exit(1)

VERSION = '0.3'

class TegakiBootstrapError(Exception):
    """Raised when bootstrapping cannot proceed (e.g. empty input collection)."""

class TegakiBootstrap(object):
    """
    Bootstraps a handwriting collection for a target character domain.

    For each character in the cjklib domain iterator, samples are taken
    directly from the source collections when available ("exact
    transformations"); otherwise writings are synthesized by recursively
    decomposing the character with cjklib's ideographic description
    sequences (IDS) and merging scaled/shifted component writings
    ("decomposition transformations").
    """

    # Placement of each component's writing for a given IDS operator:
    # xrate, yrate, dx, dy
    COMPONENT_TRANSFORMATION = {
        u'⿰': [(0.5, 1, 0, 0), (0.5, 1, 0.5, 0)],
        u'⿱': [(1, 0.5, 0, 0), (1, 0.5, 0, 0.5)],
        #u'⿴': [(1, 1, 0, 0), (0.5, 0.5, 0.25, 0.25)],
        u'⿵': [(1, 1, 0, 0), (0.5, 0.75, 0.25, 0.25)],
        u'⿶': [(1, 1, 0, 0), (0.5, 0.75, 0.25, 0)],
        #u'⿷': [(1, 1, 0, 0), (0.75, 0.5, 0.25, 0.25)],
        u'⿸': [(1, 1, 0, 0), (0.75, 0.75, 0.25, 0.25)],
        u'⿹': [(1, 1, 0, 0), (0.75, 0.75, 0, 0.25)],
        u'⿺': [(1, 1, 0, 0), (0.75, 0.75, 0.25, 0)],
        u'⿲': [(0.33, 1, 0, 0), (0.33, 1, 0.33, 0), (0.33, 1, 0.66, 0)],
        u'⿳': [(1, 0.33, 0, 0), (1, 0.33, 0, 0.33), (1, 0.33, 0, 0.66)],
        }

    RADICALS_NON_VISUAL_EQUIVALENCE = set([u'⺄', u'⺆', u'⺇', u'⺈', u'⺊',
        u'⺌', u'⺍', u'⺎', u'⺑', u'⺗', u'⺜', u'⺥', u'⺧', u'⺪', u'⺫', u'⺮',
        u'⺳', u'⺴', u'⺶', u'⺷', u'⺻', u'⺼', u'⻏', u'⻕'])
    """
    Radical forms that have a equivalent character form with no visual
    resemblance.
    """

    MIN_COMPONENT_PRODUCTIVITY = 2
    """
    Min productivity when reporting out-domain components that could help boost
    the in-domain set.
    """

    def __init__(self, options, args):
        """
        ``options``/``args`` come straight from OptionParser.parse_args();
        ``args[0]``, if present, is the output path (stdout otherwise).
        """
        self._directories = options.directories
        self._charcols = options.charcols
        self._tomoe = options.tomoe
        self._kuchibue = options.kuchibue
        self._include = options.include
        self._exclude = options.exclude
        self._max_samples = options.max_samples
        # NOTE(review): assert is stripped under -O; a raised error would be
        # more robust validation, left unchanged here.
        assert self._max_samples
        self._locale = options.locale
        self._character_domain = options.character_domain
        self._no_exact_transformation = options.no_exact_transformation
        self._quiet = options.quiet

        try:
            self._output_path = args[0]
        except:
            # No positional output path given -> write to stdout in run().
            self._output_path = None

        self._cjk = CharacterLookup(self._locale, self._character_domain)

    def _get_charcol(self):
        """
        Build the aggregated source collection from all configured inputs
        (charcol XML files, directories, Tomoe dicts, Kuchibue databases),
        apply include/exclude filters and cap samples per character.
        """
        _charcol = get_aggregated_charcol(((TYPE_CHARCOL, self._charcols),
                                           (TYPE_DIRECTORY, self._directories),
                                           (TYPE_TOMOE, self._tomoe),
                                           (TYPE_KUCHIBUE, self._kuchibue)))

        _charcol.include_characters_from_files(self._include)
        _charcol.exclude_characters_from_files(self._exclude)

        # max samples
        _charcol.remove_samples(keep_at_most=self._max_samples)

        return _charcol

    def run(self):
        """Bootstrap and write the result to the output path (or stdout)."""
        # do the bootstrapping
        to_charcol = self.bootstrap()

        # max samples
        #to_charcol.remove_samples(keep_at_most=self._max_samples)

        # output
        if not self._output_path:
            # outputs to stdout if no output path specified
            print to_charcol.to_xml()
        else:
            # Compression is chosen from the output file extension.
            gzip = False; bz2 = False
            if self._output_path.endswith(".gz"): gzip = True
            if self._output_path.endswith(".bz2"): bz2 = True
            to_charcol.write(self._output_path, gzip=gzip, bz2=bz2)

    def bootstrap(self):
        """
        Iterate over every character of the target domain, collect exact
        and decomposition-based writings into a new CharacterCollection,
        and (unless --quiet) print coverage statistics plus the characters
        and components that are still missing.
        """
        n_chars = 0
        n_exact_transformations = 0
        char_n_exact_transformations = 0
        n_decomposition_transformations = 0
        char_n_decomposition_transformations = 0

        char_n_missing_transformations = 0
        char_n_underrepresented = 0

        to_charcol = CharacterCollection()

        # component -> list of target chars that need it
        missing_char_dict = {}
        missing_single_characters = []
        # iterate through all characters of the target character set
        for target_char in self._cjk.getDomainCharacterIterator():
        #for target_char in iter([u'亄', u'乿', u'仜', u'伳']): # DEBUG
            n_chars += 1
            # simple progress indicator, one dot per 100 characters
            if n_chars % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            # set name in the output collection is the UTF-8 encoded char
            char_set = target_char.encode('utf8')
            source_character_lookup = self._get_source_character_lookup()

            exact_transformations = 0
            if (target_char in source_character_lookup
                and not self._no_exact_transformation):
                char_n_exact_transformations += 1

                to_charcol.add_set(char_set)
                source_chars = source_character_lookup[target_char]
                for character in source_chars[:self._max_samples]:
                    exact_transformations += 1
                    to_charcol.append_character(char_set, character)

            n_exact_transformations += exact_transformations
            n_total_transformation = exact_transformations

            # fill up with decomposition transformations?
            need_n_chars = self._max_samples - exact_transformations

            decomposition_transformations = 0
            if need_n_chars > 0:
                writing_objects, missing_chars \
                    = self.get_writings_from_decomposition(target_char,
                        force_decomposition=True)
                if writing_objects:
                    char_n_decomposition_transformations += 1

                    writing_objects = writing_objects[:need_n_chars]
                    for writing in writing_objects:
                        decomposition_transformations += 1

                        character = Character()
                        character.set_writing(writing)
                        character.set_unicode(target_char)
                        to_charcol.append_character(char_set, character)

                n_total_transformation += decomposition_transformations

                if n_total_transformation == 0:
                    if missing_chars:
                        # list components that can help us build this transform.
                        for missing in missing_chars:
                            if missing not in missing_char_dict:
                                missing_char_dict[missing] = []
                            missing_char_dict[missing].append(target_char)
                    else:
                        # no decomposition data at all for this character
                        missing_single_characters.append(target_char)

            n_decomposition_transformations += decomposition_transformations

            # if no direct transformation exists we have no data at all
            if n_total_transformation == 0:
                char_n_missing_transformations += 1
            elif n_total_transformation < self._max_samples:
                # we have data, just not enough
                char_n_underrepresented += 1

        sys.stdout.write('\n')

        if not self._quiet:
            # encoding used below to print CJK characters to the terminal
            _, default_encoding = locale.getdefaultlocale()
            total = n_exact_transformations + n_decomposition_transformations
            # NOTE(review): the percentages below use Python 2 integer
            # division, and 'total' can be 0 if nothing was transformed,
            # which would raise ZeroDivisionError — confirm acceptable.

            print 'Total characters: %d' % n_chars
            print 'Total transformation (instances): %d' % total

            print 'Characters with exact transformations: %d (%d%%)' \
                % (char_n_exact_transformations,
                   100 * char_n_exact_transformations / n_chars)
            print 'Total exact transformations: %d (%d%%)' \
                % (n_exact_transformations,
                   100 * n_exact_transformations / total)
            print 'Average exact transformations: %f' \
                % (1. * n_exact_transformations / n_chars)

            print 'Characters with decomposition transformations: %d (%d%%)' \
                % (char_n_decomposition_transformations,
                   100 * char_n_decomposition_transformations / n_chars)
            print 'Total decomposition transformations: %d (%d%%)' \
                % (n_decomposition_transformations,
                   100 * n_decomposition_transformations / total)
            print 'Average decomposition transformations: %f' \
                % (1. * n_decomposition_transformations / n_chars)

            print 'Characters missing transformations: %d (%d%%)' \
                % (char_n_missing_transformations,
                    100 * char_n_missing_transformations / n_chars)
            if self._max_samples > 1:
                print 'Characters with less than %d instances: %d (%d%%)' \
                    % (self._max_samples, char_n_underrepresented,
                        100 * char_n_underrepresented / n_chars)

            # missing single characters
            # Extend by those with components, that have a component with low
            #   productivity.
            in_domain_components = set(
                self._cjk.filterDomainCharacters(missing_char_dict.keys()))

            low_productivity_component_chars = []
            # NOTE(review): deleting from missing_char_dict while iterating
            # items() is safe in Python 2 only (items() returns a list);
            # this would break under a Python 3 port.
            for component, chars in missing_char_dict.items():
                if component not in in_domain_components \
                    and len(chars) < self.MIN_COMPONENT_PRODUCTIVITY:
                    low_productivity_component_chars.extend(chars)
                    del missing_char_dict[component]
            missing_single_characters.extend(low_productivity_component_chars)

            print 'Missing single characters:',
            print ''.join(missing_single_characters).encode(default_encoding)

            # remove characters that we already placed in "single"
            _missing_single_characters = set(missing_single_characters)
            for component, chars in missing_char_dict.items():
                missing_char_dict[component] = list(
                    set(chars) - _missing_single_characters)
                if not missing_char_dict[component]:
                    del missing_char_dict[component]

            # missing components

            # sort by productivity (number of dependent characters), highest
            # first; tuple-parameter lambda is Python 2-only syntax
            missingComponents = sorted(missing_char_dict.items(),
                key=lambda (x,y): len(y))
            missingComponents.reverse()

            in_domain_component_list = [(component, chars) \
                for component, chars in missingComponents \
                if component in in_domain_components]
            # only show "out-domain" components if they have productivity > 1
            out_domain_component_list = [(component, chars) \
                for component, chars in missingComponents \
                if component not in in_domain_components and len(chars) > 1]

            print 'Missing components: %d' % (len(in_domain_component_list) \
                + len(out_domain_component_list))
            print 'Missing in-domain components:',
            print ', '.join(['%s (%s)' % (component, ''.join(chars)) \
                for component, chars in in_domain_component_list])\
                .encode(default_encoding)
            print 'Missing out-domain components:',
            print ', '.join(['%s (%s)' % (component, ''.join(chars)) \
                for component, chars in out_domain_component_list])\
                .encode(default_encoding)

        return to_charcol

    def _get_source_character_lookup(self):
        """
        Lazily build and cache a dict mapping each unicode character to the
        list of Character samples available for it in the source inputs.

        Raises TegakiBootstrapError if the aggregated input is empty.
        """
        if not hasattr(self, '_source_character_lookup'):
            self._source_character_lookup = {}

            charcol = self._get_charcol()
            if charcol.get_total_n_characters() == 0:
                raise TegakiBootstrapError("Empty input collection provided")

            for character in charcol.get_all_characters():
                char = character.get_utf8().decode('utf8')
                if char not in self._source_character_lookup:
                    self._source_character_lookup[char] = []
                self._source_character_lookup[char].append(character)

        return self._source_character_lookup

    def get_writings_from_decomposition(self, char, force_decomposition=False):
        """
        Collect up to max_samples Writing objects for ``char``: direct
        samples (unless force_decomposition), samples of the radical's
        equivalent character form, then writings synthesized from each of
        the character's decomposition entries.

        Returns (writing_objects, missing_chars) where missing_chars lists
        components for which no data could be found.
        """
        writing_objects = []

        source_char_lookup = self._get_source_character_lookup()

        # NOTE(review): this local is never used in this method.
        exact_transformations = 0
        if (not force_decomposition and char in source_char_lookup):
            writing_objects.extend([character.get_writing() \
                for character in source_char_lookup[char]])

        # For a radical form, also accept data of its equivalent character
        # form — unless the two look nothing alike.
        if (CharacterLookup.isRadicalChar(char)
            and char not in self.RADICALS_NON_VISUAL_EQUIVALENCE):
            try:
                equivChar = self._cjk.getRadicalFormEquivalentCharacter(char)
                if equivChar in source_char_lookup:
                    writing_objects.extend([character.get_writing()
                        for character in source_char_lookup[equivChar]])
            except UnsupportedError:
                pass

        # add decompositions, limit to upper bound max_samples
        missing_chars = []
        if len(writing_objects) < self._max_samples:
            decompositions = self._cjk.getDecompositionEntries(char)
            for decomposition in decompositions:
                writing_objs, _, missing = self._get_writing_from_entry(
                    decomposition)
                if not writing_objs:
                    missing_chars.extend(missing)
                writing_objects.extend(writing_objs)

                if len(writing_objects) >= self._max_samples:
                    break

        writing_objects = writing_objects[:self._max_samples]

        return writing_objects, missing_chars

    def _get_writing_from_entry(self, decomposition, index=0):
        """
        Goes through a decomposition, a flat IDS sequence whose entries are
        either IDS operator characters or (character, glyph) tuples,
        starting at position ``index``.

        Returns (writing_objects, index, missing_chars) where ``index`` is
        the position of the last entry consumed by this (recursive) call.
        """
        if type(decomposition[index]) != type(()):
            # IDS operator
            character = decomposition[index]
            writing_objects_list = []
            missing_chars = []
            if CharacterLookup.isBinaryIDSOperator(character):
                # check for IDS operators we can't make any order
                # assumption about
                if character in [u'⿴', u'⿻', u'⿷']:
                    return [], index, []
                else:
                    # Get stroke order for both components
                    for _ in range(0, 2):
                        writing_objs, index, missing \
                            = self._get_writing_from_entry(decomposition,
                                index+1)
                        if not writing_objs:
                            missing_chars.extend(missing)
                            #return [], index
                        writing_objects_list.append(writing_objs)

            elif CharacterLookup.isTrinaryIDSOperator(character):
                # Get stroke order for three components
                for _ in range(0, 3):
                    writing_objs, index, missing = self._get_writing_from_entry(
                        decomposition, index+1)
                    if not writing_objs:
                        missing_chars.extend(missing)
                        #return [], index
                    writing_objects_list.append(writing_objs)
            else:
                assert False, 'not an IDS character'

            # merge: build every combination of the components' writings
            writing_objects = []
            if not missing_chars:
                compound_writings = TegakiBootstrap.cross(*writing_objects_list)
                # shuffle to provide more variation
                random.shuffle(compound_writings)
                compound_writings = compound_writings[:self._max_samples]

                for writing_objs in compound_writings:
                    writing = self.merge_writing_objects(character,
                        writing_objs)
                    writing_objects.append(writing)
            return writing_objects, index, missing_chars
        else:
            # no IDS operator but character
            char, _ = decomposition[index]
            # if the character is unknown or there is none raise
            if char == u'?':
                return [], index, []
            else:
                # recursion
                writing_objs, missing_chars \
                    = self.get_writings_from_decomposition(char)
                if not writing_objs and not missing_chars:
                    # Report the equivalent character form (when it visually
                    # resembles the radical) as the missing one.
                    if (CharacterLookup.isRadicalChar(char)
                        and char not in self.RADICALS_NON_VISUAL_EQUIVALENCE):
                        try:
                            char = self._cjk.getRadicalFormEquivalentCharacter(
                                char)
                        except UnsupportedError:
                            pass
                    missing_chars = [char]
                return writing_objs, index, missing_chars

        # unreachable: both branches above return
        assert False

    @classmethod
    def merge_writing_objects(cls, ids_char, writing_objects):
        """
        Merge the components' writings into a single Writing according to
        the layout prescribed by IDS operator ``ids_char``.

        Raises ValueError for operators with no layout entry.
        """
        if ids_char not in cls.COMPONENT_TRANSFORMATION:
            raise ValueError("Not supported") # [u'⿴', u'⿻', u'⿷']

        assert (CharacterLookup.isBinaryIDSOperator(ids_char) \
            and len(writing_objects) == 2) \
            or (CharacterLookup.isTrinaryIDSOperator(ids_char) \
            and len(writing_objects) == 3)
        assert len(cls.COMPONENT_TRANSFORMATION[ids_char]) \
            == len(writing_objects)

        transformations = cls.COMPONENT_TRANSFORMATION[ids_char]
        # reverse transformations where inner part is written first
        if ids_char in [u'⿺', u'⿶']:
            writing_objects.reverse()
            transformations = transformations[:]
            transformations.reverse()

        resultingWriting = Writing()
        for idx, transformation in enumerate(transformations):
            xrate, yrate, dx, dy = transformation

            # scale and shift a copy of the component into its slot, then
            # rescale into the result writing's coordinate system
            obj = writing_objects[idx].copy()
            obj.resize(xrate, yrate)
            obj.move_rel(dx * obj.get_width(), dy * obj.get_height())
            obj.resize(resultingWriting.get_width() / obj.get_width(),
                resultingWriting.get_height() / obj.get_height())
            for s in obj.get_strokes(True):
                resultingWriting.append_stroke(s)

        return resultingWriting

    @staticmethod
    def cross(*args):
        """Return the Cartesian product of the argument lists, as lists."""
        ans = [[]]
        for arg in args:
            ans = [x+[y] for x in ans for y in arg]
        return ans


# Command-line interface: build the option parser, then run the bootstrap.
parser = OptionParser(usage="usage: %prog [options] [output-path]",
                      version="%prog " + VERSION)

# Input sources; each option may be given multiple times.
parser.add_option("-d", "--directory",
                  action="append", type="string", dest="directories",
                  default=[],
                  help="Directory containing individual XML character files")
parser.add_option("-c", "--charcol",
                  action="append", type="string", dest="charcols",
                  default=[],
                  help="character collection XML files")
parser.add_option("-t", "--tomoe-dict",
                  action="append", type="string", dest="tomoe",
                  default=[],
                  help="Tomoe XML dictionary files")
parser.add_option("-k", "--kuchibue",
                  action="append", type="string", dest="kuchibue",
                  default=[],
                  help="Kuchibue unipen database")


# Character filtering and per-character sample cap.
parser.add_option("-i", "--include",
                  action="append", type="string", dest="include",
                  default=[],
                  help="File containing characters to include")
parser.add_option("-e", "--exclude",
                  action="append", type="string", dest="exclude",
                  default=[],
                  help="File containing characters to exclude")
parser.add_option("-m", "--max-samples",
                  type="int", dest="max_samples", default=1,
                  help="Maximum number of samples per character")


# Target character set (passed to cjklib's CharacterLookup).
parser.add_option("-l", "--locale",
                  type="string", dest="locale",
                  default='T',
                  help="Character locale of target characters")
parser.add_option("--domain",
                  type="string", dest="character_domain",
                  default='Unicode',
                  help="Character domain of target characters")


# Behavior flags.
parser.add_option("-x", "--no-exact", dest="no_exact_transformation",
                  action="store_true",
                  help="Don't use exact transformations" \
                    + ", use only decompositions")
parser.add_option("-q", "--quiet", dest="quiet",
                  action="store_true",
                  help="Don't print any statistics")

(options, args) = parser.parse_args()

try:
    TegakiBootstrap(options, args).run()
except TegakiBootstrapError, e: # Python 2 'except X, e' syntax
    # Report the error and show usage, then exit non-zero.
    sys.stderr.write(str(e) + "\n\n")
    parser.print_help()
    sys.exit(1)