/usr/include/tesseract/ambigs.h is in libtesseract-dev 3.02.01-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | ///////////////////////////////////////////////////////////////////////
// File: ambigs.h
// Description: Constants, flags, functions for dealing with
// ambiguities (training and recognition).
// Author: Daria Antonova
// Created: Mon Aug 23 11:26:43 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_AMBIGS_H_
#define TESSERACT_CCUTIL_AMBIGS_H_
#include "elst.h"
#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "genericvector.h"
#define MAX_AMBIG_SIZE 10
namespace tesseract {
typedef GenericVector<UNICHAR_ID> UnicharIdVector;
static const int kUnigramAmbigsBufferSize = 1000;
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
static const char kAmbigDelimiters[] = "\t ";
static const char kIllegalMsg[] =
"Illegal ambiguity specification on line %d\n";
static const char kIllegalUnicharMsg[] =
"Illegal unichar %s in ambiguity specification\n";
enum AmbigType {
NOT_AMBIG, // the ngram pair is not ambiguous
REPLACE_AMBIG, // ocred ngram should always be substituted with correct
DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
CASE_AMBIG, // this is a case ambiguity (1-1)
AMBIG_TYPE_COUNT // number of enum entries
};
// A collection of utility functions for arrays of UNICHAR_IDs that are
// terminated by INVALID_UNICHAR_ID.
class UnicharIdArrayUtils {
public:
// Compares two arrays of unichar ids. Returns -1 if the length of array1 is
// less than length of array2, if any array1[i] is less than array2[i].
// Returns 0 if the arrays are equal, 1 otherwise.
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
static inline int compare(const UNICHAR_ID array1[],
const UNICHAR_ID array2[]) {
const UNICHAR_ID *ptr1 = array1;
const UNICHAR_ID *ptr2 = array2;
while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
++ptr1;
++ptr2;
}
if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
}
// Look uid in the vector of uids. If found, the index of the matched
// element is returned. Otherwise, it returns -1.
static inline int find_in(const UnicharIdVector& uid_vec,
const UNICHAR_ID uid) {
for (int i = 0; i < uid_vec.size(); ++i)
if (uid_vec[i] == uid) return i;
return -1;
}
// Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
// and that dst has enough space for all the elements from src.
static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
int i = 0;
do {
dst[i] = src[i];
} while (dst[i++] != INVALID_UNICHAR_ID);
return i - 1;
}
// Prints unichars corresponding to the unichar_ids in the given array.
// The function assumes that array is terminated by INVALID_UNICHAR_ID.
static inline void print(const UNICHAR_ID array[],
const UNICHARSET &unicharset) {
const UNICHAR_ID *ptr = array;
if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
while (*ptr != INVALID_UNICHAR_ID) {
tprintf("%s ", unicharset.id_to_unichar(*ptr++));
}
tprintf("( ");
ptr = array;
while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
tprintf(")\n");
}
};
// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
// start with the same unichar (e.g. r->t rn->m rr1->m).
class AmbigSpec : public ELIST_LINK {
public:
AmbigSpec();
~AmbigSpec() {}
// Comparator function for sorting AmbigSpec_LISTs. The lists will
// be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
// in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
static int compare_ambig_specs(const void *spec1, const void *spec2) {
const AmbigSpec *s1 =
*reinterpret_cast<const AmbigSpec * const *>(spec1);
const AmbigSpec *s2 =
*reinterpret_cast<const AmbigSpec * const *>(spec2);
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
}
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
UNICHAR_ID correct_ngram_id;
AmbigType type;
int wrong_ngram_size;
};
ELISTIZEH(AmbigSpec);
// AMBIG_TABLE[i] stores a set of ambiguities whose
// wrong ngram starts with unichar id i.
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
class UnicharAmbigs {
public:
UnicharAmbigs() {}
~UnicharAmbigs() {
replace_ambigs_.delete_data_pointers();
dang_ambigs_.delete_data_pointers();
one_to_one_definite_ambigs_.delete_data_pointers();
}
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
// Fills in two ambiguity tables (replaceable and dangerous) with information
// read from the ambigs file. An ambiguity table is an array of lists.
// The array is indexed by a class id. Each entry in the table provides
// a list of potential ambiguities which can start with the corresponding
// character. For example the ambiguity "rn -> m", would be located in the
// table at index of unicharset.unichar_to_id('r').
// In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
// of the wrong part of the ambiguity and each entry contains a vector of
// unichar ids that are ambiguous to it.
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
bool use_ambigs_for_adaption, UNICHARSET *unicharset);
// Returns definite 1-1 ambigs for the given unichar id.
inline const UnicharIdVector *OneToOneDefiniteAmbigs(
UNICHAR_ID unichar_id) const {
if (one_to_one_definite_ambigs_.empty()) return NULL;
return one_to_one_definite_ambigs_[unichar_id];
}
// Returns a pointer to the vector with all unichar ids that appear in the
// 'correct' part of the ambiguity pair when the given unichar id appears
// in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
// m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
// m will return a pointer to a vector with unichar ids of r,n,i.
inline const UnicharIdVector *AmbigsForAdaption(
UNICHAR_ID unichar_id) const {
if (ambigs_for_adaption_.empty()) return NULL;
return ambigs_for_adaption_[unichar_id];
}
// Similar to the above, but return the vector of unichar ids for which
// the given unichar_id is an ambiguity (appears in the 'wrong' part of
// some ambiguity pair).
inline const UnicharIdVector *ReverseAmbigsForAdaption(
UNICHAR_ID unichar_id) const {
if (reverse_ambigs_for_adaption_.empty()) return NULL;
return reverse_ambigs_for_adaption_[unichar_id];
}
private:
bool ParseAmbiguityLine(int line_num, int version, int debug_level,
const UNICHARSET &unicharset, char *buffer,
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
int *ReplacementAmbigPartSize,
char *ReplacementString, int *type);
void InsertIntoTable(UnicharAmbigsVector &table,
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
int ReplacementAmbigPartSize,
const char *ReplacementString, int type,
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
UnicharAmbigsVector dang_ambigs_;
UnicharAmbigsVector replace_ambigs_;
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
GenericVector<UnicharIdVector *> ambigs_for_adaption_;
GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_AMBIGS_H_
|