This file is indexed.

/usr/include/tesseract/ambigs.h is in libtesseract-dev 3.02.01-6.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
///////////////////////////////////////////////////////////////////////
// File:        ambigs.h
// Description: Constants, flags, functions for dealing with
//              ambiguities (training and recognition).
// Author:      Daria Antonova
// Created:     Mon Aug 23 11:26:43 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCUTIL_AMBIGS_H_
#define TESSERACT_CCUTIL_AMBIGS_H_

#include "elst.h"
#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "genericvector.h"

#define MAX_AMBIG_SIZE    10

namespace tesseract {

typedef GenericVector<UNICHAR_ID> UnicharIdVector;

static const int kUnigramAmbigsBufferSize = 1000;
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
static const char kAmbigDelimiters[] = "\t ";
static const char kIllegalMsg[] =
  "Illegal ambiguity specification on line %d\n";
static const char kIllegalUnicharMsg[] =
  "Illegal unichar %s in ambiguity specification\n";

enum AmbigType {
  NOT_AMBIG,        // the ngram pair is not ambiguous
  REPLACE_AMBIG,    // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG,    // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG,       // this is a case ambiguity (1-1)

  AMBIG_TYPE_COUNT  // number of enum entries
};

// A collection of utility functions for arrays of UNICHAR_IDs that are
// terminated by INVALID_UNICHAR_ID.
class UnicharIdArrayUtils {
 public:
  // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
  // less than length of array2, if any array1[i] is less than array2[i].
  // Returns 0 if the arrays are equal, 1 otherwise.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
  static inline int compare(const UNICHAR_ID array1[],
                            const UNICHAR_ID array2[]) {
    const UNICHAR_ID *ptr1 = array1;
    const UNICHAR_ID *ptr2 = array2;
    while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
      if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
      ++ptr1;
      ++ptr2;
    }
    if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
    return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
  }

  // Look uid in the vector of uids.  If found, the index of the matched
  // element is returned.  Otherwise, it returns -1.
  static inline int find_in(const UnicharIdVector& uid_vec,
                            const UNICHAR_ID uid) {
    for (int i = 0; i < uid_vec.size(); ++i)
      if (uid_vec[i] == uid) return i;
    return -1;
  }

  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
  // and that dst has enough space for all the elements from src.
  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
    int i = 0;
    do {
      dst[i] = src[i];
    } while (dst[i++] != INVALID_UNICHAR_ID);
    return i - 1;
  }

  // Prints unichars corresponding to the unichar_ids in the given array.
  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
  static inline void print(const UNICHAR_ID array[],
                           const UNICHARSET &unicharset) {
    const UNICHAR_ID *ptr = array;
    if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
    while (*ptr != INVALID_UNICHAR_ID) {
      tprintf("%s ", unicharset.id_to_unichar(*ptr++));
    }
    tprintf("( ");
    ptr = array;
    while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
    tprintf(")\n");
  }
};

// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
// start with the same unichar (e.g. r->t rn->m rr1->m).
class AmbigSpec : public ELIST_LINK {
 public:
  AmbigSpec();
  ~AmbigSpec() {}

  // Comparator function for sorting AmbigSpec_LISTs. The lists will
  // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
  // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
  static int compare_ambig_specs(const void *spec1, const void *spec2) {
    const AmbigSpec *s1 =
      *reinterpret_cast<const AmbigSpec * const *>(spec1);
    const AmbigSpec *s2 =
      *reinterpret_cast<const AmbigSpec * const *>(spec2);
    return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
  }

  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
  UNICHAR_ID correct_ngram_id;
  AmbigType type;
  int wrong_ngram_size;
};
ELISTIZEH(AmbigSpec);

// AMBIG_TABLE[i] stores a set of ambiguities whose
// wrong ngram starts with unichar id i.
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;

class UnicharAmbigs {
 public:
  UnicharAmbigs() {}
  ~UnicharAmbigs() {
    replace_ambigs_.delete_data_pointers();
    dang_ambigs_.delete_data_pointers();
    one_to_one_definite_ambigs_.delete_data_pointers();
  }

  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }

  // Fills in two ambiguity tables (replaceable and dangerous) with information
  // read from the ambigs file. An ambiguity table is an array of lists.
  // The array is indexed by a class id. Each entry in the table provides
  // a list of potential ambiguities which can start with the corresponding
  // character. For example the ambiguity "rn -> m", would be located in the
  // table at index of unicharset.unichar_to_id('r').
  // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
  // of the wrong part of the ambiguity and each entry contains a vector of
  // unichar ids that are ambiguous to it.
  void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
                         bool use_ambigs_for_adaption, UNICHARSET *unicharset);

  // Returns definite 1-1 ambigs for the given unichar id.
  inline const UnicharIdVector *OneToOneDefiniteAmbigs(
      UNICHAR_ID unichar_id) const {
    if (one_to_one_definite_ambigs_.empty()) return NULL;
    return one_to_one_definite_ambigs_[unichar_id];
  }

  // Returns a pointer to the vector with all unichar ids that appear in the
  // 'correct' part of the ambiguity pair when the given unichar id appears
  // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
  // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
  // m will return a pointer to a vector with unichar ids of r,n,i.
  inline const UnicharIdVector *AmbigsForAdaption(
      UNICHAR_ID unichar_id) const {
    if (ambigs_for_adaption_.empty()) return NULL;
    return ambigs_for_adaption_[unichar_id];
  }

  // Similar to the above, but return the vector of unichar ids for which
  // the given unichar_id is an ambiguity (appears in the 'wrong' part of
  // some ambiguity pair).
  inline const UnicharIdVector *ReverseAmbigsForAdaption(
      UNICHAR_ID unichar_id) const {
    if (reverse_ambigs_for_adaption_.empty()) return NULL;
    return reverse_ambigs_for_adaption_[unichar_id];
  }

 private:

  bool ParseAmbiguityLine(int line_num, int version, int debug_level,
                          const UNICHARSET &unicharset, char *buffer,
                          int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                          int *ReplacementAmbigPartSize,
                          char *ReplacementString, int *type);
  void InsertIntoTable(UnicharAmbigsVector &table,
                       int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                       int ReplacementAmbigPartSize,
                       const char *ReplacementString, int type,
                       AmbigSpec *ambig_spec, UNICHARSET *unicharset);
  UnicharAmbigsVector dang_ambigs_;
  UnicharAmbigsVector replace_ambigs_;
  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
  GenericVector<UnicharIdVector *> ambigs_for_adaption_;
  GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
};

}  // namespace tesseract

#endif  // TESSERACT_CCUTIL_AMBIGS_H_