This file is indexed.

/usr/include/tesseract/unicharcompress.h is in libtesseract-dev 4.00~git2288-10f4998a-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
///////////////////////////////////////////////////////////////////////
// File:        unicharcompress.h
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
// Created:     Wed Mar 04 14:45:01 PST 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_

#include <cstdint>
#include <unordered_map>

#include "serialis.h"
#include "strngs.h"
#include "unicharset.h"

namespace tesseract {

// Trivial class to hold the code for a recoded unichar-id: a sequence of at
// most kMaxCodeLen integer code values plus a self-normalized flag.
class RecodedCharID {
 public:
  // The maximum length of a code.
  static const int kMaxCodeLen = 9;

  // Constructs an empty (zero-length) code with every code value zeroed and
  // the self-normalized flag set.
  RecodedCharID() : self_normalized_(1), length_(0), code_{} {}

  // Sets the number of code values in use to length. No range check is
  // performed, so the caller must keep length within [0, kMaxCodeLen].
  void Truncate(int length) { length_ = length; }
  // Sets the code value at the given index in the code, growing the length
  // if needed so that it covers index. No range check is performed, so index
  // must be in [0, kMaxCodeLen).
  void Set(int index, int value) {
    code_[index] = value;
    if (length_ <= index) length_ = index + 1;
  }
  // Shorthand for setting codes of length 3, as all Hangul and Han codes are
  // length 3.
  void Set3(int code0, int code1, int code2) {
    length_ = 3;
    code_[0] = code0;
    code_[1] = code1;
    code_[2] = code2;
  }
  // Accessors
  // Returns the number of code values in use.
  int length() const { return length_; }
  // Returns the code value at the given index (not range checked).
  int operator()(int index) const { return code_[index]; }

  // Writes to the given file. Returns false in case of error.
  // Only the first length_ code values are written.
  bool Serialize(TFile* fp) const {
    if (fp->FWrite(&self_normalized_, sizeof(self_normalized_), 1) != 1)
      return false;
    if (fp->FWrite(&length_, sizeof(length_), 1) != 1) return false;
    if (fp->FWrite(code_, sizeof(code_[0]), length_) != length_) return false;
    return true;
  }
  // Reads from the given file. Returns false in case of error, including a
  // stored length that does not fit in the code array.
  // Multi-byte fields are read with FReadEndian.
  bool DeSerialize(TFile* fp) {
    if (fp->FRead(&self_normalized_, sizeof(self_normalized_), 1) != 1)
      return false;
    if (fp->FReadEndian(&length_, sizeof(length_), 1) != 1) return false;
    // The length comes straight from the file: reject anything that would
    // read past the end of code_, and leave the object with a usable length.
    if (length_ < 0 || length_ > kMaxCodeLen) {
      length_ = 0;
      return false;
    }
    if (fp->FReadEndian(code_, sizeof(code_[0]), length_) != length_)
      return false;
    return true;
  }
  // Two codes are equal if they have the same length and the same values in
  // every used position. The self-normalized flag is not compared.
  bool operator==(const RecodedCharID& other) const {
    if (length_ != other.length_) return false;
    for (int i = 0; i < length_; ++i) {
      if (code_[i] != other.code_[i]) return false;
    }
    return true;
  }
  // Hash functor for RecodedCharID.
  struct RecodedCharIDHash {
    size_t operator()(const RecodedCharID& code) const {
      // Accumulate in 64 bits: the shift count reaches 7 * (kMaxCodeLen - 1)
      // = 56, which would be undefined behaviour on a 32-bit int.
      uint64_t result = 0;
      for (int i = 0; i < code.length_; ++i) {
        result ^= static_cast<uint64_t>(code(i)) << (7 * i);
      }
      return static_cast<size_t>(result);
    }
  };

 private:
  // True if this code is self-normalizing, ie is the master entry for indices
  // that map to the same code. Has boolean value, but int8_t for serialization.
  int8_t self_normalized_;
  // The number of elements in use in code_;
  int32_t length_;
  // The re-encoded form of the unichar-id to which this RecodedCharID relates.
  int32_t code_[kMaxCodeLen];
};

// Class holds a "compression" of a unicharset to simplify the learning problem
// for a neural-network-based classifier.
// Objectives:
// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
//          a sequence of 3 codes with much fewer values.
//          This is achieved using the Jamo coding for Hangul and the Unicode
//          Radical-Stroke-index for Han.
// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
//            as the unicode sequence (but coded in a more compact space).
// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
//               and not significantly distinct shapes (quotes) together, ie
//               represent the fi ligature as the f-i pair, and fold u+2019 and
//               friends all onto the ascii single quote (').
// 4 The null character and mapping to target activations:
//    To save horizontal coding space, the compressed codes are generally mapped
//    to target network activations without intervening null characters, BUT
//    in the case of ligatures, such as ff, null characters have to be included
//    so existence of repeated codes is detected at codebook-building time, and
//    null characters are embedded directly into the codes, so the rest of the
//    system doesn't need to worry about the problem (much). There is still an
//    effect on the range of ways in which the target activations can be
//    generated.
//
// The computed code values are compact (no unused values), and, for CJK,
// unique (each code position uses a disjoint set of values from each other code
// position). For non-CJK, the same code value CAN be used in multiple
// positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
// is the same code as is used for the single f.
class UnicharCompress {
 public:
  UnicharCompress();
  UnicharCompress(const UnicharCompress& src);
  ~UnicharCompress();
  UnicharCompress& operator=(const UnicharCompress& src);

  // Unicode value of the first Hangul character.
  static const int kFirstHangul = 0xac00;
  // How many Hangul unicodes there are.
  static const int kNumHangul = 11172;
  // A Hangul character has 3 parts: Leading consonant, Vowel and Trailing
  // consonant. These are the numbers of Jamos available for each part.
  static const int kLCount = 19;
  static const int kVCount = 21;
  static const int kTCount = 28;

  // Builds the encoding for the given unicharset. The caller must already
  // have read the file training/langdata/radical-stroke.txt into the input
  // string radical_stroke_table.
  // Returns false if the encoding cannot be constructed.
  bool ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                       STRING* radical_stroke_table);
  // Configures a pass-through encoder, ie one that leaves the unichars
  // completely unchanged.
  void SetupPassThrough(const UNICHARSET& unicharset);
  // Configures the encoder directly from the given encoding vector, which
  // maps unichar_ids to the given codes.
  void SetupDirect(const GenericVector<RecodedCharID>& codes);

  // Returns the number of distinct values that a code can take, ie
  // 1 + the largest value that any RecodedCharID code will ever hold in any
  // position of its array.
  int code_range() const { return code_range_; }

  // Encodes one unichar_id, storing the encoding in code. Returns the length
  // of the code, or zero if the input is invalid.
  int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
  // Decodes code back to the original unichar-id. Returns
  // INVALID_UNICHAR_ID if the input is invalid.
  int DecodeUnichar(const RecodedCharID& code) const;
  // Returns true if the given code is a valid start or single code.
  bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
  // Looks up the valid non-final next codes for a given prefix code.
  // Returns a null pointer if the prefix has no entry.
  const GenericVector<int>* GetNextCodes(const RecodedCharID& code) const {
    const auto entry = next_codes_.find(code);
    if (entry == next_codes_.end()) return nullptr;
    return entry->second;
  }
  // Looks up the valid final codes for a given prefix code.
  // Returns a null pointer if the prefix has no entry.
  const GenericVector<int>* GetFinalCodes(const RecodedCharID& code) const {
    const auto entry = final_codes_.find(code);
    if (entry == final_codes_.end()) return nullptr;
    return entry->second;
  }

  // Writes to the given file. Returns false in case of error.
  bool Serialize(TFile* fp) const;
  // Reads from the given file. Returns false in case of error.
  bool DeSerialize(TFile* fp);

  // Returns a STRING holding a text file that describes the encoding, with
  // one line per code in the form:
  // <index>[,<index>]*<tab><UTF8-str><newline>
  // That is, each line has a comma-separated list of one or more indices,
  // then a tab, then the UTF-8 string that the code represents. Most simple
  // scripts will encode a single index to a UTF8-string, but Chinese,
  // Japanese, Korean and the Indic scripts will contain a many-to-many
  // mapping.
  STRING GetEncodingAsString(const UNICHARSET& unicharset) const;

  // Helper that splits a Hangul unicode into its 3 parts: leading, vowel and
  // trailing. The values returned are 0-based indices, NOT unicode Jamo.
  // Returns false if the input is not in the Hangul unicode range.
  static bool DecomposeHangul(int unicode, int* leading, int* vowel,
                              int* trailing);

 private:
  // Renumbers codes so that no values are left unused.
  void DefragmentCodeValues(int encoded_null);
  // Derives the value of code_range_ from the encoder_.
  void ComputeCodeRange();
  // Builds the decoding hash_map from the encoder_ array.
  void SetupDecoder();
  // Frees allocated memory.
  void Cleanup();

  // Encoder mapping a unichar-id to a sequence of small codes.
  // Only encoder_ is serialized; everything else is computed on load.
  GenericVector<RecodedCharID> encoder_;
  // Decoder mapping the output of the encoder back to a unichar-id.
  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash>
      decoder_;
  // Entry is true if its index is a valid single or start code.
  GenericVector<bool> is_valid_start_;
  // Map from a prefix code to a list of valid next codes.
  // The vectors are owned by the map.
  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
                     RecodedCharID::RecodedCharIDHash>
      next_codes_;
  // Map from a prefix code to a list of valid final codes.
  // The vectors are owned by the map.
  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
                     RecodedCharID::RecodedCharIDHash>
      final_codes_;
  // 1 + the maximum of any value in encoder_.
  int code_range_;
};

}  // namespace tesseract.

#endif  // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_