// /usr/include/kytea/kytea-config.h is in libkytea-dev 0.4.6+dfsg-2.
// This file is owned by root:root, with mode 0o644.
// The actual contents of the file can be viewed below.
/*
* Copyright 2009, KyTea Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef KYTEA_CONFIG_H__
#define KYTEA_CONFIG_H__
// Forward-declares kytea::KyteaConfig ahead of the standard includes; the
// full class definition follows further down in this header.
namespace kytea { class KyteaConfig; }
#include <iosfwd>
#include <string>
#include <vector>
namespace kytea {

class StringUtil;

// Settings for a KyTea training or analysis run.
//
// The object stores the corpus, dictionary and model file names, the feature
// and solver parameters, and the output formatting strings. Values are filled
// in either by the command line parsers or through the setters below and are
// read back through the getters.
//
// The order and types of the data members are part of the binary layout shared
// with the out-of-line member functions, so they must not be changed here.
//
// NOTE(review): a copy constructor and a destructor are declared but no copy
// assignment operator is. Whether util_ / featStr_ are owned by this object
// cannot be told from this header; if they are, the implicitly generated
// assignment would copy the raw pointers. Confirm before assigning configs.
class KyteaConfig {

private:

    // must be the same as CorpusIO::Format, not used directly because of cross-dependencies
    typedef char CorpForm;

    bool onTraining_;
    unsigned debug_; // the debugging level
                     // 0 = silent (default for running)
                     // 1 = simple progress updates (default for training)
                     // 2 = detailed progress updates
                     // 3 = everything

    StringUtil * util_; // a std::string utility to hold the encoding, etc

    std::vector<std::string> corpora_;      // corpora to read for training
    std::vector<CorpForm> corpusFormats_;   // the annotation of each corpus
    std::vector<std::string> dicts_;        // dictionaries to read
    std::vector<std::string> subwordDicts_; // subword dictionaries to use for unknown estimation
    std::string model_;                     // model file to write/read
    char modelForm_;                        // model format (ModelIO::Format)
    std::string input_, output_;            // the file to input/output
    CorpForm inputForm_, outputForm_;       // the format/file to input/output to (default: stdout, full)
    std::string featIn_, featOut_;          // feature file names, see setFeatureIn/setFeatureOut
    std::ostream* featStr_;                 // stream handed out by getFeatureOutStream()

    bool doWS_, doTags_, doUnk_;
    std::vector<bool> doTag_; // per tag level switch, see getDoTag/setDoTag

    // feature options
    bool addFeat_;      // whether or not to add newly found features
    double confidence_; // when using probability, only annotate or use values that
                        // are at least this confident (default: 0=deterministic)
    // charW: the number of characters on either side of the boundary to use (default: 3)
    // charN: the maximum n-gram order of characters to use (default: 3)
    // typeW: the number of character types on either side of the boundary to use (default: 3)
    // typeN: the maximum n-gram order of types to use (default: 3)
    // dictN: all dictionary words over this are treated as equal frequency (default: 4)
    char charW_, charN_, typeW_, typeN_, dictN_;

    // unknown word arguments
    // unkN: the n-gram length of the unknown word spelling model
    // defTag: a default tag to use when no candidates were generated
    // unkTag: a tag to append after every word with no tag in the dictionary
    char unkN_;
    unsigned unkBeam_;
    std::string defTag_;
    std::string unkTag_;

    // liblinear training values
    double bias_;    // the bias used for liblinear training
    double eps_;     // the termination epsilon
    double cost_;    // the cost for the SVM or LR training
    int solverType_; // the type of solver to be used

    // extra arguments, should be input/output for the analyzer
    std::vector<std::string> args_;

    // set the type of the input corpus
    void setIOFormat(const char* str, CorpForm & cf);

    // formatting tags
    std::string wordBound_, tagBound_, elemBound_, unkBound_, noBound_, hasBound_, skipBound_, escape_;

    // hard constraint on character divisions. can be used for digits, etc.
    std::string wsConstraint_;

    // the number of tag levels
    int numTags_;
    std::vector<bool> global_; // per index flag, see getGlobal/setGlobal

    // tagMax: the maximum number of tags to return for a word
    unsigned tagMax_;

    // check argument legality
    void ch(const char * n, const char* v);

    // Maps a null C string to "" so that it can be stored in a std::string.
    // Assigning a null pointer to std::string is undefined behaviour.
    static const char * nullToEmpty(const char * s) { return s ? s : ""; }

public:

    KyteaConfig();
    KyteaConfig(const KyteaConfig & rhs);
    ~KyteaConfig();

    void addCorpus(const std::string & corp, CorpForm format);
    void addDictionary(const std::string & corp);
    void addSubwordDict(const std::string & corp);

    // parse command line arguments
    void parseTrainCommandLine(int argc, const char ** argv);
    void parseRunCommandLine(int argc, const char ** argv);
    void printUsage();
    void printVersion();

    // parse a single argument
    // the value argument can be null
    // return 1 if the value was used 0 if not
    unsigned parseTrainArg(const char * n, const char * v);
    unsigned parseRunArg(const char * n, const char * v);

    // getters
    const std::vector<std::string> & getCorpusFiles() const { return corpora_; }
    const std::vector<CorpForm> & getCorpusFormats() const { return corpusFormats_; }
    const std::vector<std::string> & getDictionaryFiles() const { return dicts_; }
    const std::vector<std::string> & getSubwordDictFiles() const { return subwordDicts_; }
    const std::string & getModelFile();
    const char getModelFormat() const { return modelForm_; }
    const unsigned getDebug() const { return debug_; }
    StringUtil * getStringUtil() { return util_; }
    const StringUtil * getStringUtil() const { return util_; }
    const CorpForm getInputFormat() const { return inputForm_; }
    const CorpForm getOutputFormat() const { return outputForm_; }
    const std::string & getFeatureIn() const { return featIn_; }
    const std::string & getFeatureOut() const { return featOut_; }
    // true when a feature output file name has been set
    const bool getWriteFeatures() const { return !featOut_.empty(); }
    const char getCharN() const { return charN_; }
    const char getCharWindow() const { return charW_; }
    const char getTypeN() const { return typeN_; }
    const char getTypeWindow() const { return typeW_; }
    const char getDictionaryN() const { return dictN_; }
    const char getUnkN() const { return unkN_; }
    const unsigned getTagMax() const { return tagMax_; }
    const unsigned getUnkBeam() const { return unkBeam_; }
    const std::string & getUnkTag() const { return unkTag_; }
    const std::string & getDefaultTag() const { return defTag_; }
    const std::string & getWsConstraint() const { return wsConstraint_; }
    const double getBias() const { return bias_; }
    const double getEpsilon() const { return eps_; }
    const double getCost() const { return cost_; }
    const int getSolverType() const { return solverType_; }
    const bool getDoWS() const { return doWS_; }
    const bool getDoUnk() const { return doUnk_; }
    const bool getDoTags() const { return doTags_; }

    // Whether tag level i is switched on.
    // Always false when tagging as a whole is off (doTags_) or when i is
    // negative. A level past the end of the stored flags has never been
    // configured and counts as switched on.
    const bool getDoTag(int i) const {
        if(!doTags_ || i < 0)
            return false;
        const std::vector<bool>::size_type idx = static_cast<std::vector<bool>::size_type>(i);
        return idx >= doTag_.size() || doTag_[idx];
    }

    const char* getWordBound() const { return wordBound_.c_str(); }
    const char* getTagBound() const { return tagBound_.c_str(); }
    const char* getElemBound() const { return elemBound_.c_str(); }
    const char* getUnkBound() const { return unkBound_.c_str(); }
    const char* getNoBound() const { return noBound_.c_str(); }
    const char* getHasBound() const { return hasBound_.c_str(); }
    const char* getSkipBound() const { return skipBound_.c_str(); }
    const char* getEscape() const { return escape_.c_str(); }
    const double getConfidence() const { return confidence_; }
    const char getEncoding() const;
    const char* getEncodingString() const;
    int getNumTags() const { return numTags_; }

    // The global_ flag stored for index i; false for any index that was never
    // set, including negative ones.
    bool getGlobal(int i) const {
        if(i < 0)
            return false;
        const std::vector<bool>::size_type idx = static_cast<std::vector<bool>::size_type>(i);
        return idx < global_.size() && global_[idx];
    }

    const std::vector<std::string> & getArguments() const { return args_; }

    // setters
    // Setters taking a C string store an empty string when given a null pointer.
    void setDebug(unsigned debug) { debug_ = debug; }
    void setModelFile(const char* file) { model_ = nullToEmpty(file); }
    void setModelFormat(char mf) { modelForm_ = mf; }
    void setEpsilon(double v) { eps_ = v; }
    void setCost(double v) { cost_ = v; }
    // stores 1.0 when v is true and -1.0 when it is false
    void setBias(bool v) { bias_ = (v ? 1.0 : -1.0); }
    void setSolverType(int v) { solverType_ = v; }
    void setCharWindow(char v) { charW_ = v; }
    void setCharN(char v) { charN_ = v; }
    void setTypeWindow(char v) { typeW_ = v; }
    void setTypeN(char v) { typeN_ = v; }
    void setDictionaryN(char v) { dictN_ = v; }
    void setUnkN(char v) { unkN_ = v; }
    void setTagMax(unsigned v) { tagMax_ = v; }
    void setUnkBeam(unsigned v) { unkBeam_ = v; }
    void setUnkTag(const std::string & v) { unkTag_ = v; }
    void setUnkTag(const char* v) { unkTag_ = nullToEmpty(v); }
    void setDefaultTag(const std::string & v) { defTag_ = v; }
    void setDefaultTag(const char* v) { defTag_ = nullToEmpty(v); }
    void setOnTraining(bool v) { onTraining_ = v; }
    void setDoWS(bool v) { doWS_ = v; }
    void setDoUnk(bool v) { doUnk_ = v; }
    void setDoTags(bool v) { doTags_ = v; }

    // Switches tag level i on or off. Levels created while growing the list
    // start out switched on. A negative level is ignored because it cannot
    // address any stored flag.
    void setDoTag(int i, bool v) {
        if(i < 0)
            return;
        const std::vector<bool>::size_type idx = static_cast<std::vector<bool>::size_type>(i);
        if(idx >= doTag_.size())
            doTag_.resize(idx + 1, true);
        doTag_[idx] = v;
    }

    void setInputFormat(CorpForm v) { inputForm_ = v; }
    void setWordBound(const char* v) { wordBound_ = nullToEmpty(v); }
    void setTagBound(const char* v) { tagBound_ = nullToEmpty(v); }
    void setElemBound(const char* v) { elemBound_ = nullToEmpty(v); }
    void setUnkBound(const char* v) { unkBound_ = nullToEmpty(v); }
    void setNoBound(const char* v) { noBound_ = nullToEmpty(v); }
    void setHasBound(const char* v) { hasBound_ = nullToEmpty(v); }
    void setSkipBound(const char* v) { skipBound_ = nullToEmpty(v); }
    void setEscape(const char* v) { escape_ = nullToEmpty(v); }
    void setNumTags(int v) { numTags_ = v; }

    // Raises the global_ flag for index v, growing the list with cleared
    // entries as needed. A negative index is ignored because it cannot address
    // any stored flag.
    void setGlobal(int v) {
        if(v < 0)
            return;
        const std::vector<bool>::size_type idx = static_cast<std::vector<bool>::size_type>(v);
        if(global_.size() <= idx)
            global_.resize(idx + 1, false);
        global_[idx] = true;
    }

    void setFeatureIn(const std::string & featIn) { featIn_ = featIn; }
    void setFeatureOut(const std::string & featOut) { featOut_ = featOut; }
    void setWsConstraint(const std::string & wsConstraint) { wsConstraint_ = wsConstraint; }

    std::ostream * getFeatureOutStream();
    void closeFeatureOutStream();

    // set the encoding of the StringUtil class and reset all the IOs
    void setEncoding(const char* str);

};

}
#endif // KYTEA_CONFIG_H__