/usr/include/kytea/kytea.h is in libkytea-dev 0.4.6+dfsg-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
* Copyright 2009-2010, KyTea Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef KYTEA_H__
#define KYTEA_H__
#include <kytea/kytea-config.h>
#include <kytea/kytea-struct.h>
#include <vector>
namespace kytea {
// Forward declarations of the types named by the Kytea class declaration.

// Test harness class (declared a friend of Kytea)
class KyteaTest;

// String handling and configuration
class StringUtil;
class KyteaConfig;

// Dictionary template and the entry types it is instantiated with
template <typename T> class Dictionary;
class ModelTagEntry;
class ProbTagEntry;

// Model types
class KyteaModel;
class KyteaLM;

// Feature input/output helper
class FeatureIO;
// a class representing the main analyzer
class Kytea {
private:
friend class KyteaTest;
typedef unsigned FeatureId;
typedef std::vector<KyteaSentence*> Sentences;
typedef std::vector< std::vector< FeatureId > > SentenceFeatures;
StringUtil* util_;
KyteaConfig* config_;
Dictionary<ModelTagEntry> * dict_;
Sentences sentences_;
// Values for the word segmentation models
KyteaModel* wsModel_;
Dictionary<ProbTagEntry>* subwordDict_;
std::vector<KyteaLM*> subwordModels_;
std::vector<KyteaModel*> globalMods_;
std::vector< std::vector<KyteaString> > globalTags_;
std::vector<unsigned> dictFeats_;
std::vector<KyteaString> charPrefixes_, typePrefixes_;
FeatureIO* fio_;
public:
///////////////////////////////////////////////////////////////////
// API functions //
///////////////////////////////////////////////////////////////////
// Read a model from the file fileName. Character encoding,
// settings, and other information will be read automatically.
void readModel(const char* fileName);
// Writes a model representing the current instance to the
// file fileName. The model will be of the type specified
// by the parameters in KyteaConfig
void writeModel(const char* fileName);
// Calculate the word segmentation for a sentence
void calculateWS(KyteaSentence & sent);
// Calculate the tagss for a sentence
void calculateTags(KyteaSentence & sent, int lev);
// Calculate the unknown pronunciation for a single unknown word
void calculateUnknownTag(KyteaWord & str, int lev);
// Get the string utility class that allows you to map to/from
// Kyteas internal string representation (using
// mapString/showString)
StringUtil* getStringUtil() { return config_->getStringUtil(); }
// Get the the configuration of this isntance of KyTea
KyteaConfig* getConfig() { return config_; }
// These are available for convenience, and require you to set
// the appropriate settings in KyteaConfig first
// "trainAll" performs full training of Kytea from start to finish
void trainAll();
// "analyze" loads models, and analyzes the full corpus input
void analyze();
///////////////////////////////////////////////////////////////////
// Constructor/Destructor //
///////////////////////////////////////////////////////////////////
void init();
Kytea() : config_(new KyteaConfig()) { init(); }
Kytea(KyteaConfig * config) : config_(config) { init(); }
~Kytea();
KyteaModel* getWSModel() { return wsModel_; }
// Set the word segmentation model and take control of it
void setWSModel(KyteaModel* model) { wsModel_ = model; }
// Set the dictionary and take control of it
template <class Entry>
void setDictionary(Dictionary<Entry> * dict);
///////////////////////////////////////////////////////////////////
// Functions used internally during Kytea training, testing etc. //
///////////////////////////////////////////////////////////////////
public:
void checkEqual(const Kytea & rhs);
private:
// functions to create dictionaries
void buildVocabulary();
// a function that checks to make sure that configuration is correct before
// training
void trainSanityCheck();
// functions for word segmentation
void trainWS();
void preparePrefixes();
unsigned wsDictionaryFeatures(const KyteaString & sent, SentenceFeatures & feat);
unsigned wsNgramFeatures(const KyteaString & sent, SentenceFeatures & feat, const std::vector<KyteaString> & prefixes, int n);
// functions for tagging
void trainLocalTags(int lev);
void trainGlobalTags(int lev);
unsigned tagNgramFeatures(const KyteaString & chars, std::vector<unsigned> & feat, const std::vector<KyteaString> & prefixes, KyteaModel * model, int n, int sc, int ec);
unsigned tagSelfFeatures(const KyteaString & self, std::vector<unsigned> & feat, const KyteaString & pref, KyteaModel * model);
unsigned tagDictFeatures(const KyteaString & surf, int lev, std::vector<unsigned> & myFeats, KyteaModel * model);
// Get matches of the dictionary for a single word in the form of
// { <x_1, y_1>, <x_2, y_2> }
// where x is the dictionary and y is the tag that exists in the dicitonary
std::vector<std::pair<int,int> > getDictionaryMatches(const KyteaString & str, int lev);
template <class Entry>
void addTag(typename Dictionary<Entry>::WordMap& allWords, const KyteaString & word, int lev, const KyteaString * tag, int dict);
template <class Entry>
void addTag(typename Dictionary<Entry>::WordMap& allWords, const KyteaString & word, const KyteaTag * tag, int dict);
template <class Entry>
void scanDictionaries(const std::vector<std::string> & dict, typename Dictionary<Entry>::WordMap & wordMap, KyteaConfig * config, StringUtil * util, bool saveIds = true);
// functions for unknown word PE
void trainUnk(int lev);
void buildFeatureLookups();
void analyzeInput();
std::vector<KyteaTag> generateTagCandidates(const KyteaString & str, int lev);
};
}
#endif
|