/usr/share/espeak-gui/src/language.py

# -*- coding: utf-8 -*-
#
# Graphical interface for the eSpeak speech synthesizer
#
# Copyright © 2010-2012 Siegfried-Angel Gevatter Pujals <siegfried@gevatter.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import locale
import ctypes
from xdg import BaseDirectory

class LanguageIdentifier:
    """Guess the language of a text using the libtextcat C library.

    The library is loaded through ctypes.  A configuration file listing
    every language model found below ``LIBTEXTCAT_DIR``/LM is generated in
    the user's XDG cache directory and handed to ``textcat_Init``.
    """

    LIBTEXTCAT_DIR = '/usr/share/libtextcat/'

    _libtextcat = None
    _handle = None

    def __init__(self):
        """Load libtextcat and create a classifier handle.

        Raises:
            OSError: if the shared library cannot be loaded or the
                configuration file cannot be written or read back.
            RuntimeError: if ``textcat_Init`` returns a NULL handle.
        """
        self._conf_file = self._get_list_filename()
        lib = ctypes.CDLL('libtextcat.so.0', ctypes.RTLD_GLOBAL)
        # Declare complete prototypes.  Without ``argtypes`` ctypes passes
        # Python integers as C ``int``, which can truncate the ``void *``
        # handle on 64-bit platforms.
        lib.textcat_Init.argtypes = [ctypes.c_char_p]
        lib.textcat_Init.restype = ctypes.c_void_p
        lib.textcat_Classify.argtypes = [
            ctypes.c_void_p, ctypes.c_char_p, ctypes.c_size_t]
        lib.textcat_Classify.restype = ctypes.c_char_p
        lib.textcat_Done.argtypes = [ctypes.c_void_p]
        lib.textcat_Done.restype = None
        lib.textcat_Version.restype = ctypes.c_char_p
        self._libtextcat = lib

        handle = lib.textcat_Init(self._to_bytes(self._conf_file))
        if not handle:
            # A NULL handle must never reach textcat_Classify.
            raise RuntimeError(
                'textcat_Init failed for "%s"' % self._conf_file)
        self._handle = handle
        self._languages = self._get_language_list(self._conf_file)

    def __del__(self):
        """Release the classifier handle, if one was ever created."""
        # __init__ may have failed before the library or the handle were
        # set up, in which case the class-level None defaults are visible.
        lib = getattr(self, '_libtextcat', None)
        handle = getattr(self, '_handle', None)
        if lib is not None and handle is not None:
            lib.textcat_Done(handle)
            self._handle = None

    def identify(self, text):
        """Return the candidate language names for *text*.

        Args:
            text: byte string or unicode string; unicode is encoded as
                UTF-8 before being passed to the library.

        Returns:
            A list of language names parsed from the library's
            ``[name][name]...`` answer, or an empty list when the library
            answers ``SHORT``/``UNKNOWN`` or returns no result.
        """
        data = self._to_bytes(text)
        # The size argument is a byte count, so measure the encoded buffer.
        result = self._libtextcat.textcat_Classify(
            self._handle, data, len(data))
        if result is None:
            return []
        if not isinstance(result, str):
            # Python 3 ctypes returns bytes for c_char_p.
            result = result.decode('utf-8')
        if result in ('SHORT', 'UNKNOWN'):
            return []
        return result.replace('[', '').split(']')[:-1]

    @staticmethod
    def _to_bytes(value):
        """Return *value* as a byte string, encoding text as UTF-8."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    @classmethod
    def _get_list_filename(cls):
        """Return the path of the libtextcat configuration file to use."""
        # NOTE: using a conf.txt shipped in LIBTEXTCAT_DIR is disabled; the
        # list is always regenerated from the LM directory instead.
        #conffile = os.path.join(cls.LIBTEXTCAT_DIR, 'conf.txt')
        #if os.path.isfile(conffile):
        #    return conffile
        return cls._generate_list(os.path.join(cls.LIBTEXTCAT_DIR, 'LM'))

    @staticmethod
    def _generate_list(path):
        """Write a libtextcat configuration listing the models in *path*.

        Every directory entry becomes one ``<full path>\\t<language>`` line,
        where the language is the file name up to its first dot.

        Returns:
            The path of the generated file inside the XDG cache directory.

        Raises:
            OSError: if the cache directory cannot be created or *path*
                cannot be listed.
        """
        conf_path = os.path.join(BaseDirectory.xdg_cache_home,
            'espeak-gui/libtextcat_conf.txt')
        cache_dir = os.path.dirname(conf_path)
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Only "already exists" is harmless; anything else is re-raised.
            if not os.path.isdir(cache_dir):
                raise
        with open(conf_path, 'w') as conf:
            # Sorted so that the generated file is deterministic.
            for name in sorted(os.listdir(path)):
                conf.write("%s\t%s\n" % (os.path.join(path, name),
                    name.split('.', 1)[0]))
        return conf_path

    @staticmethod
    def _get_language_list(filename):
        """Return the language names listed in a libtextcat config file.

        Everything after a ``#`` is ignored, as are lines that have no
        tab-separated second field.
        """
        languages = []
        with open(filename, 'r') as conffile:
            for raw_line in conffile:
                line = raw_line.split('#')[0].strip()
                # Everything after the first tab-separated field (the model
                # path) is taken as the language name.
                language = ''.join(line.split('\t')[1:])
                if language:
                    languages.append(language)
        return languages

    @property
    def languages(self):
        """List of language names known to the classifier."""
        return self._languages

class LanguageManager:
    """Keep track of the available eSpeak voices and pick one for a text.

    Voices whose lower-cased name is also known to the language identifier
    can be auto-detected; all remaining voices are listed as "other".
    """

    _identifier = None

    _languages_autodetect = None
    _languages_other = None

    _lang2identifier = None         # for espeak-only stuff
    _lang2iso = None                # for libtextcat -> espeak conversion
    _supported_languages = None
    _last_language = None

    def __init__(self, espeak_voices):
        """Register every voice in *espeak_voices*.

        Args:
            espeak_voices: iterable of objects exposing ``name`` and
                ``identifier`` string attributes.

        If the language identifier cannot be created a message is printed
        and auto-detection stays disabled.
        """
        try:
            self._identifier = LanguageIdentifier()
        except Exception as e:
            # Deliberate best effort: the manager still works without
            # auto-detection.
            print("Couldn't initialize language identifier: %s" % e)

        self._languages_autodetect = []
        self._languages_other = []
        self._lang2identifier = {}
        self._lang2iso = {}
        self._supported_languages = []

        # Build the lookup set once instead of scanning a list per voice.
        if self._identifier:
            detectable = frozenset(self._identifier.languages)
        else:
            detectable = frozenset()

        for voice in espeak_voices:
            voice_name = voice.name.lower()
            self._register_language(voice_name, voice.identifier)
            if voice_name in detectable:
                self._languages_autodetect.append(voice)
                self._supported_languages.append(voice_name)
            else:
                self._languages_other.append(voice)

    def _register_language(self, name, identifier):
        """Record the identifier and short code for the voice *name*.

        A trailing ``-test`` is dropped from the key of the short-code
        table, and the short code is the part of *identifier* after its
        first ``/`` (or the whole identifier when there is none).
        """
        self._lang2identifier[name] = identifier
        iso_name = name
        if name.endswith('-test'):
            iso_name = name[:-5]
        self._lang2iso[iso_name] = identifier.split('/', 1)[-1]

    def get_languages(self, autodetect=None):
        """Return the registered voices.

        Args:
            autodetect: ``None`` for all voices (auto-detectable first), a
                true value for auto-detectable voices only, any other
                false value for the remaining voices only.
        """
        result = []
        if autodetect or autodetect is None:
            result.extend(self._languages_autodetect)
        if not autodetect or autodetect is None:
            result.extend(self._languages_other)
        return result

    def get_default_language(self):
        """Return the language code of the current locale, or ``'en'``.

        The locale's language code is returned only when it matches a
        registered voice identifier or short code.
        """
        language = None
        try:
            locale_name = locale.getlocale()[0]
        except Exception:
            # Any failure to query the locale falls back to the default.
            locale_name = None
        if locale_name:
            language = locale_name.split('_')[0]

        if language is not None:
            # Accept the full voice identifier as well as the short code,
            # which is the form _choose_language() hands out.
            if (language in self._lang2identifier.values()
                    or language in self._lang2iso.values()):
                return language
        return 'en'

    def autodetect(self, text):
        """Guess the language of *text* and return a ``LanguageGuess``.

        Without a language identifier the guess has no candidates and
        carries the default language.
        """
        if not self._identifier:
            # not supported
            return LanguageGuess([], self.get_default_language())
        candidates = self._identifier.identify(text)
        candidates = [x for x in candidates if x in self._supported_languages]
        return LanguageGuess(candidates, self._choose_language(candidates))

    @property
    def autodetect_supported(self):
        """The language identifier object (truthy), or None if unavailable."""
        return self._identifier

    def _get_iso_from_lang(self, name):
        """Return the short code registered for *name*, or None.

        A message is printed when the name is unknown.
        """
        try:
            return self._lang2iso[name.lower()]
        except KeyError:
            print('Can\'t translate "%s" to ISO code.' % name)
            return None

    def _choose_language(self, candidates):
        """Pick the language to use for the given candidate names.

        The choice is remembered and reused when a later call has nothing
        better to offer.
        """
        language = None
        # We use the first guessed candidate for which we have an identifier
        for candidate in candidates or ():
            language = self._get_iso_from_lang(candidate)
            if language:
                break

        if not language:
            # If that fails, we use the previous guessed language and, if
            # there is none either, the default language.
            language = self._last_language or self.get_default_language()

        self._last_language = language
        return language

class LanguageGuess:
    """Result of a language detection: candidate names plus chosen language."""

    _candidates = None
    _language = None

    def __init__(self, candidates, language):
        """Store the candidate names and the language that was selected."""
        self._candidates, self._language = candidates, language

    @property
    def good(self):
        """The stored candidates; truthy when there is at least one."""
        return self._candidates

    @staticmethod
    def _cut_string(string, length):
        """Shorten *string* to at most *length* characters.

        Strings that are too long are cut and end in three dots.
        """
        too_long = len(string) > length
        if too_long:
            kept = string[:length-3]
            return "%s..." % kept
        return string

    def get_display_name(self):
        """Return the capitalised candidates as a label of at most 25 chars."""
        names = [candidate.capitalize() for candidate in self._candidates]
        joined = ', '.join(names)
        return self._cut_string(joined, 25)

    def get_language(self):
        """Return the language selected for this guess."""
        return self._language

if __name__ == '__main__':
    # Manual smoke test: print the detected candidates for two samples.
    identifier = LanguageIdentifier()
    # English
    # (adjacent literals are joined by the parser; the previous mismatched
    # quotes embedded a stray quote and indentation in the sample text)
    print(identifier.identify('Over the coming weeks, we will be making two '
        'important updates that will impact how you interact...'))
    # Catalan
    print(identifier.identify('Hola, aquí tens la nova versió del programa. Caldria '
        'afegir un nou paràgraf al final de la descripció. '
        'Qualsevol cosa em dius. Moltes gràcies!'))
espeak-gui 0.4-3 / usr / share / espeak-gui / src / language.py