/usr/bin/tegaki-stats

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2009 The Tegaki project contributors
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# Contributors to this file:
# - Mathieu Blondel

import sys
import os
from optparse import OptionParser

from tegaki.charcol import CharacterCollection
from tegakitools.charcol import *

VERSION = '0.3.1'

class TegakiStatsError(Exception):
    """
    Error raised by TegakiStats when statistics cannot be computed.

    The script entry point catches it, writes the message to standard
    error, prints the usage help and exits with status 1.
    """

class TegakiStats(object):
    """
    Show various statistics about a character collection.

    The collection is aggregated from every source given in the options
    (character collection files, databases, directories, Tomoe
    dictionaries, Kuchibue databases), filtered with the include/exclude
    files and the per-character sample limit, then summarised on
    standard output.
    """

    def __init__(self, options, args):
        """
        Remember the parsed command-line options.

        options -- object exposing the attributes read below (the values
                   object returned by the option parser).
        args    -- leftover positional arguments; accepted to keep the
                   constructor signature stable but not used.
        """
        self._directories = options.directories
        self._charcols = options.charcols
        self._databases = options.databases
        self._tomoe = options.tomoe
        self._kuchibue = options.kuchibue
        self._include = options.include
        self._exclude = options.exclude
        self._max_samples = options.max_samples
        self._verbosity_level = options.verbosity_level

    def run(self):
        """
        Build the aggregated collection and print its statistics.

        Raises TegakiStatsError when the resulting collection contains
        no character at all.

        Verbosity level 1 additionally lists the characters found for
        each stroke count; level 2 also lists, for each character, its
        number of samples and its distinct stroke counts.
        """
        charcol = get_aggregated_charcol(
                        ((TYPE_CHARCOL, self._charcols),
                         (TYPE_CHARCOL_DB, self._databases),
                         (TYPE_DIRECTORY, self._directories),
                         (TYPE_TOMOE, self._tomoe),
                         (TYPE_KUCHIBUE, self._kuchibue)))

        charcol.include_characters_from_files(self._include)
        charcol.exclude_characters_from_files(self._exclude)

        # max samples (None or 0 means "no limit")
        if self._max_samples:
            charcol.remove_samples(keep_at_most=self._max_samples)

        if charcol.get_total_n_characters() == 0:
            raise TegakiStatsError("No character collection provided "
                                   "or character collections are empty!")

        # NOTE: every print below takes a single pre-formatted string so
        # that the statement behaves the same with or without the print
        # function.  The double space after some labels reproduces the
        # historical output of "print label, value".
        print("Number of sets: %d" % charcol.get_n_sets())

        samp_by_class = self._get_samples_by_class(charcol)
        # the number of sets and the number of different characters (classes)
        # are the same if sets are used to group characters by class
        print("Number of different characters:  %s" % len(samp_by_class))

        print("Total number of samples:  %s" %
              charcol.get_total_n_characters())

        print("Total number of strokes:  %s" % charcol.get_total_n_strokes())

        avg = self._mean(samp_by_class.values())
        print("Average number of samples per character/class: %0.2f" % avg)

        if self._verbosity_level >= 2:
            print("\nNumber of samples for each character:")
            # sorted so that the listing is reproducible between runs
            for utf8 in sorted(samp_by_class):
                print("%s: %d" % (utf8, samp_by_class[utf8]))
            print("\n")

        classes_by_sc = self._get_classes_by_stroke_count(charcol)
        avg = self._mean([len(chars) for chars in classes_by_sc.values()])
        print("Average number of chars/classes per stroke count: %0.2f" % avg)

        if self._verbosity_level >= 1:
            print("\nCharacters found for each stroke count:")
            for sc in sorted(classes_by_sc):
                print("%d: %s" % (sc, ", ".join(classes_by_sc[sc])))
            print("\n")

        sc_by_class = self._get_stroke_counts_by_class(charcol)
        avg = self._mean([len(counts) for counts in sc_by_class.values()])
        print("Average number of stroke counts per char/class: %0.2f" % avg)

        if self._verbosity_level >= 2:
            print("\nDifferent stroke counts for each character:")
            # sorted so that the listing is reproducible between runs
            for utf8 in sorted(sc_by_class):
                stroke_counts = [str(i) for i in sorted(sc_by_class[utf8])]
                print("%s: %s" % (utf8, ", ".join(stroke_counts)))
            print("\n")

    @staticmethod
    def _mean(values):
        """
        Return the arithmetic mean of values as a float.

        An empty sequence yields 0.0 instead of a division by zero.
        """
        values = list(values)
        if not values:
            return 0.0
        return float(sum(values)) / len(values)

    def _iter_rows(self, charcol):
        """
        Yield every character row of charcol, set after set.
        """
        for set_name in charcol.get_set_list():
            for row in charcol.get_character_rows(set_name):
                yield row

    def _get_samples_by_class(self, charcol):
        """
        Return a dict mapping each UTF-8 encoded character to the number
        of rows (samples) found for it across all sets.
        """
        d = {}
        for row in self._iter_rows(charcol):
            utf8 = row['utf8'].encode("utf8")
            d[utf8] = d.get(utf8, 0) + 1
        return d

    def _get_classes_by_stroke_count(self, charcol):
        """
        Return a dict mapping each stroke count to the list of distinct
        UTF-8 encoded characters having a sample with that stroke count.

        Each list keeps the order in which characters were first met.
        """
        d = {}
        # (n_strokes, utf8) pairs already recorded: constant-time
        # membership test instead of scanning the list for every row.
        seen = set()
        for row in self._iter_rows(charcol):
            n_strokes = row['n_strokes']
            utf8 = row['utf8'].encode("utf8")
            chars = d.setdefault(n_strokes, [])
            if (n_strokes, utf8) not in seen:
                seen.add((n_strokes, utf8))
                chars.append(utf8)
        return d

    def _get_stroke_counts_by_class(self, charcol):
        """
        Return a dict mapping each UTF-8 encoded character to the list
        of distinct stroke counts observed among its samples.

        Each list keeps the order in which stroke counts were first met.
        """
        d = {}
        # (utf8, n_strokes) pairs already recorded: constant-time
        # membership test instead of scanning the list for every row.
        seen = set()
        for row in self._iter_rows(charcol):
            n_strokes = row['n_strokes']
            utf8 = row['utf8'].encode("utf8")
            counts = d.setdefault(utf8, [])
            if (utf8, n_strokes) not in seen:
                seen.add((utf8, n_strokes))
                counts.append(n_strokes)
        return d
        


# Command-line interface.  Every source/filter option uses the "append"
# action, so it may be given several times; the values are collected
# in a list (empty by default).
parser = OptionParser(usage="usage: %prog [options]",
                      version="%prog " + VERSION)

# Sources of characters
parser.add_option("-d", "--directory",
                  action="append", type="string", dest="directories",
                  default=[],
                  help="Directory containing individual XML character files")
parser.add_option("-c", "--charcol",
                  action="append", type="string", dest="charcols",
                  default=[],
                  help="character collection XML files")
parser.add_option("-b", "--db",
                  action="append", type="string", dest="databases",
                  default=[],
                  help="character collection database files")
parser.add_option("-t", "--tomoe-dict",
                  action="append", type="string", dest="tomoe",
                  default=[],
                  help="Tomoe XML dictionary files")
parser.add_option("-k", "--kuchibue",
                  action="append", type="string", dest="kuchibue",
                  default=[],
                  help="Kuchibue unipen database")

# Filters applied to the aggregated collection
parser.add_option("-i", "--include",
                  action="append", type="string", dest="include",
                  default=[],
                  help="File containing characters to include")
parser.add_option("-e", "--exclude",
                  action="append", type="string", dest="exclude",
                  default=[],
                  help="File containing characters to exclude")
parser.add_option("-m", "--max-samples",
                  type="int", dest="max_samples",
                  help="Maximum number of samples per character")

# Output control
parser.add_option("-v", "--verbosity-level",
                  type="int", dest="verbosity_level", default=0,
                  help="verbosity level between 0 and 2")

(options, args) = parser.parse_args()

try:
    TegakiStats(options, args).run()
except TegakiStatsError:
    # Fetch the exception through sys.exc_info() rather than binding it in
    # the except clause: this spelling is accepted by every Python version.
    e = sys.exc_info()[1]
    # Report the problem, remind the user of the available options and
    # signal failure through the exit status.
    sys.stderr.write(str(e) + "\n\n")
    parser.print_help()
    sys.exit(1)
python-tegakitools 0.3.1-1 / usr / bin / tegaki-stats