/usr/include/tesseract/ratngs.h is in libtesseract-dev 3.02.01-6.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/**********************************************************************
* File: ratngs.h (Formerly ratings.h)
* Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
* Author: Ray Smith
* Created: Thu Apr 23 11:40:38 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef RATNGS_H
#define RATNGS_H
#include <assert.h>
#include "clst.h"
#include "genericvector.h"
#include "notdll.h"
#include "unichar.h"
#include "unicharset.h"
#include "werd.h"
class BLOB_CHOICE: public ELIST_LINK
{
public:
BLOB_CHOICE() {
unichar_id_ = INVALID_UNICHAR_ID;
fontinfo_id_ = -1;
fontinfo_id2_ = -1;
rating_ = MAX_FLOAT32;
certainty_ = -MAX_FLOAT32;
script_id_ = -1;
language_model_state_ = NULL;
min_xheight_ = 0;
max_xheight_ = 0;
adapted_ = false;
}
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
float src_rating, // rating
float src_cert, // certainty
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
int script_id, // script
inT16 min_xheight, // min xheight in image pixel units
inT16 max_xheight, // max xheight allowed by this char
bool adapted); // adapted match or not
BLOB_CHOICE(const BLOB_CHOICE &other);
~BLOB_CHOICE() {}
UNICHAR_ID unichar_id() const {
return unichar_id_;
}
float rating() const {
return rating_;
}
float certainty() const {
return certainty_;
}
inT16 fontinfo_id() const {
return fontinfo_id_;
}
inT16 fontinfo_id2() const {
return fontinfo_id2_;
}
int script_id() const {
return script_id_;
}
void *language_model_state() {
return language_model_state_;
}
inT16 xgap_before() const {
return xgap_before_;
}
inT16 xgap_after() const {
return xgap_after_;
}
inT16 min_xheight() const {
return min_xheight_;
}
inT16 max_xheight() const {
return max_xheight_;
}
bool adapted() const {
return adapted_;
}
void set_unichar_id(UNICHAR_ID newunichar_id) {
unichar_id_ = newunichar_id;
}
void set_rating(float newrat) {
rating_ = newrat;
}
void set_certainty(float newrat) {
certainty_ = newrat;
}
void set_fontinfo_id(inT16 newfont) {
fontinfo_id_ = newfont;
}
void set_fontinfo_id2(inT16 newfont) {
fontinfo_id2_ = newfont;
}
void set_script(int newscript_id) {
script_id_ = newscript_id;
}
void set_language_model_state(void *language_model_state) {
language_model_state_ = language_model_state;
}
void set_xgap_before(inT16 gap) {
xgap_before_ = gap;
}
void set_xgap_after(inT16 gap) {
xgap_after_ = gap;
}
void set_adapted(bool adapted) {
adapted_ = adapted;
}
static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
BLOB_CHOICE* choice = new BLOB_CHOICE;
*choice = *src;
return choice;
}
void print(const UNICHARSET *unicharset) {
tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
(unicharset == NULL) ? "" :
unicharset->debug_str(unichar_id_).string());
}
private:
UNICHAR_ID unichar_id_; // unichar id
inT16 fontinfo_id_; // char font information
inT16 fontinfo_id2_; // 2nd choice font information
float rating_; // size related
float certainty_; // absolute
int script_id_;
// Stores language model information about this BLOB_CHOICE. Used during
// the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
// recorded in the ratings matrix.
// The pointer is owned/managed by the segmentation search.
void *language_model_state_;
inT16 xgap_before_;
inT16 xgap_after_;
// X-height range (in image pixels) that this classification supports.
inT16 min_xheight_;
inT16 max_xheight_;
bool adapted_; // true if this is a match from adapted templates
};
// Make BLOB_CHOICE listable.
// ELISTIZEH and CLISTIZEH are macros defined outside this header. Presumably
// they declare the BLOB_CHOICE_LIST and BLOB_CHOICE_LIST_CLIST types that the
// rest of this header refers to -- confirm against the list macro headers.
ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
// Permuter codes used in WERD_CHOICEs.
// The numeric values are spelled out explicitly because WERD_CHOICE stores
// the code in a uinT8 (see permuter_), so the numbering must stay stable.
enum PermuterType {
  NO_PERM           = 0,
  PUNC_PERM         = 1,
  TOP_CHOICE_PERM   = 2,
  LOWER_CASE_PERM   = 3,
  UPPER_CASE_PERM   = 4,
  NGRAM_PERM        = 5,
  NUMBER_PERM       = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM  = 8,
  DOC_DAWG_PERM     = 9,
  USER_DAWG_PERM    = 10,
  FREQ_DAWG_PERM    = 11,
  COMPOUND_PERM     = 12
};
// A candidate transcription of a word: an array of unichar ids with a
// parallel array of fragment lengths, a rating that accumulates as unichars
// are added, a certainty that tracks the minimum seen, and the permuter code
// recorded for the choice.
class WERD_CHOICE {
 public:
  // Rating installed by make_bad(); the value is defined out of line.
  static const float kBadRating;

  // Creates an empty choice with space reserved for 8 unichars.
  WERD_CHOICE(const UNICHARSET *unicharset)
      : unicharset_(unicharset) { this->init(8); }
  // Creates an empty choice with space reserved for `reserved` unichars.
  WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
      : unicharset_(unicharset) { this->init(reserved); }
  // Builds a choice from a string plus per-unichar lengths; see the
  // five-argument init() below for the contract on the arguments.
  WERD_CHOICE(const char *src_string,
              const char *src_lengths,
              float src_rating,
              float src_certainty,
              uinT8 src_permuter,
              const UNICHARSET &unicharset)
      : unicharset_(&unicharset) {
    this->init(src_string, src_lengths, src_rating,
               src_certainty, src_permuter);
  }
  // Defined out of line.
  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
  // Copy constructor: reserves word.length() slots, then delegates the
  // actual copy to operator=.
  WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) {
    this->init(word.length());
    this->operator=(word);
  }
  ~WERD_CHOICE();

  const UNICHARSET *unicharset() const {
    return unicharset_;
  }
  inline int length() const {
    return length_;
  }
  inline const UNICHAR_ID *unichar_ids() const {
    return unichar_ids_;
  }
  // Returns the unichar id at `index`; index must be in [0, length()).
  inline const UNICHAR_ID unichar_id(int index) const {
    assert(index >= 0 && index < length_);
    return unichar_ids_[index];
  }
  inline const char *fragment_lengths() const {
    return fragment_lengths_;
  }
  // Returns the fragment length at `index`; index must be in [0, length()).
  inline const char fragment_length(int index) const {
    assert(index >= 0 && index < length_);
    return fragment_lengths_[index];
  }
  inline float rating() const {
    return rating_;
  }
  inline float certainty() const {
    return certainty_;
  }
  inline uinT8 permuter() const {
    return permuter_;
  }
  const char *permuter_name() const;
  inline bool fragment_mark() const {
    return fragment_mark_;
  }
  inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
    return blob_choices_;
  }
  // Overwrites only the unichar id at `index`; rating and certainty are
  // left untouched (contrast with the five-argument overload below).
  inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
    assert(index >= 0 && index < length_);
    unichar_ids_[index] = unichar_id;
  }
  inline void set_fragment_length(char flen, int index) {
    assert(index >= 0 && index < length_);
    fragment_lengths_[index] = flen;
  }
  inline void set_rating(float new_val) {
    rating_ = new_val;
  }
  inline void set_certainty(float new_val) {
    certainty_ = new_val;
  }
  inline void set_permuter(uinT8 perm) {
    permuter_ = perm;
  }
  inline void set_fragment_mark(bool new_fragment_mark) {
    fragment_mark_ = new_fragment_mark;
  }
  // Note: this function should only be used if all the fields
  // are populated manually with set_* functions (rather than
  // (copy)constructors and append_* functions).
  inline void set_length(int len) {
    ASSERT_HOST(reserved_ >= len);
    length_ = len;
  }
  void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);

  /// Make more space in unichar_id_ and fragment_lengths_ arrays.
  /// Doubles the capacity, or allocates a single slot when nothing was
  /// reserved yet.
  inline void double_the_size() {
    if (reserved_ > 0) {
      unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
          reserved_, unichar_ids_);
      fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
          reserved_, fragment_lengths_);
      reserved_ *= 2;
    } else {
      unichar_ids_ = new UNICHAR_ID[1];
      fragment_lengths_ = new char[1];
      reserved_ = 1;
    }
  }

  /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
  /// fragment_length_ arrays. Sets other values to default (blank) values.
  /// Does not free previously held arrays, so it is only suitable for a
  /// freshly constructed object.
  inline void init(int reserved) {
    reserved_ = reserved;
    if (reserved > 0) {
      unichar_ids_ = new UNICHAR_ID[reserved];
      fragment_lengths_ = new char[reserved];
    } else {
      unichar_ids_ = NULL;
      fragment_lengths_ = NULL;
    }
    length_ = 0;
    rating_ = 0.0;
    certainty_ = MAX_FLOAT32;
    permuter_ = NO_PERM;
    fragment_mark_ = false;
    blob_choices_ = NULL;
    unichars_in_script_order_ = false;  // Tesseract is strict left-to-right.
    // Previously left indeterminate; give the flag a defined starting value.
    unichar_info_present = false;
  }

  /// Helper function to build a WERD_CHOICE from the given string,
  /// fragment lengths, rating, certainty and permuter.
  /// The function assumes that src_string is not NULL.
  /// src_lengths argument could be NULL, in which case the unichars
  /// in src_string are assumed to all be of length 1.
  void init(const char *src_string, const char *src_lengths,
            float src_rating, float src_certainty,
            uinT8 src_permuter);

  /// Set the fields in this choice to be default (bad) values.
  /// The permuter code and the reserved arrays are left as they are.
  inline void make_bad() {
    length_ = 0;
    rating_ = kBadRating;
    certainty_ = -MAX_FLOAT32;
    fragment_mark_ = false;
  }

  /// This function assumes that there is enough space reserved
  /// in the WERD_CHOICE for adding another unichar.
  /// This is an efficient alternative to append_unichar_id().
  inline void append_unichar_id_space_allocated(
      UNICHAR_ID unichar_id, char fragment_length,
      float rating, float certainty) {
    assert(reserved_ > length_);
    length_++;
    this->set_unichar_id(unichar_id, fragment_length,
                         rating, certainty, length_-1);
  }
  void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
                         float rating, float certainty);
  // Stores the unichar id and fragment length at `index`, adds `rating` to
  // the word rating and lowers the word certainty to `certainty` if that is
  // smaller. index must be in [0, length()).
  inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
                             float rating, float certainty, int index) {
    assert(index >= 0 && index < length_);
    unichar_ids_[index] = unichar_id;
    fragment_lengths_[index] = fragment_length;
    rating_ += rating;
    if (certainty < certainty_) {
      certainty_ = certainty;
    }
  }
  bool contains_unichar_id(UNICHAR_ID unichar_id) const;
  void remove_unichar_ids(int index, int num);
  // Drops the last unichar by shrinking the length only; rating and
  // certainty are not adjusted. The word must not be empty.
  inline void remove_last_unichar_id() {
    assert(length_ > 0);
    --length_;
  }
  inline void remove_unichar_id(int index) {
    this->remove_unichar_ids(index, 1);
  }
  bool has_rtl_unichar_id() const;
  void reverse_and_mirror_unichar_ids();

  // Returns the half-open interval of unichar_id indices [start, end) which
  // enclose the core portion of this word -- the part after stripping
  // punctuation from the left and right.
  void punct_stripped(int *start_core, int *end_core) const;

  // Return a copy of this WERD_CHOICE with the choices [start, end).
  // The result is useful only for checking against a dictionary.
  WERD_CHOICE shallow_copy(int start, int end) const;

  void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
  // Concatenates the unicharset debug string of every unichar, each followed
  // by a single space.
  const STRING debug_string() const {
    STRING word_str;
    for (int i = 0; i < length_; ++i) {
      word_str += unicharset_->debug_str(unichar_ids_[i]);
      word_str += " ";
    }
    return word_str;
  }

  // Call this to override the default (strict left to right graphemes)
  // with the fact that some engine produces a "reading order" set of
  // Graphemes for each word.
  // Returns the value that was just stored.
  bool set_unichars_in_script_order(bool in_script_order) {
    return unichars_in_script_order_ = in_script_order;
  }
  bool unichars_in_script_order() const {
    return unichars_in_script_order_;
  }

  // Returns a UTF-8 string equivalent to the current choice
  // of UNICHAR IDs.
  // The returned reference points at a mutable cache member that is rebuilt
  // on every call to unichar_string() or unichar_lengths().
  const STRING &unichar_string() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_string_;
  }

  // Returns the lengths, one byte each, representing the number of bytes
  // required in the unichar_string for each UNICHAR_ID.
  // Same caching behaviour as unichar_string().
  const STRING &unichar_lengths() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_lengths_;
  }

  // The `const void` return type is kept as-is because it has to match the
  // out-of-line definition of print(const char *).
  const void print() const { this->print(""); }
  const void print(const char *msg) const;

  WERD_CHOICE& operator+= (       // concatanate
      const WERD_CHOICE & second);  // second on first

  WERD_CHOICE& operator= (const WERD_CHOICE& source);

 private:
  const UNICHARSET *unicharset_;
  UNICHAR_ID *unichar_ids_;  // unichar ids that represent the text of the word
  char *fragment_lengths_;   // number of fragments in each unichar
  int reserved_;             // size of the above arrays
  int length_;               // word length
  float rating_;             // size related
  float certainty_;          // absolute
  uinT8 permuter_;           // permuter code
  bool fragment_mark_;       // if true, indicates that this choice
                             // was chosen over a better one that
                             // contained a fragment
  BLOB_CHOICE_LIST_CLIST *blob_choices_;  // best choices for each blob

  // Normally, the blob_choices_ represent the recognition results in order
  // from left-to-right. However, some engines (say Cube) may return
  // recognition results in the order of the script's major reading direction
  // (for Arabic, that is right-to-left).
  bool unichars_in_script_order_;

  // The following variables are populated and passed by reference any
  // time unichar_string() or unichar_lengths() are called.
  mutable STRING unichar_string_;
  mutable STRING unichar_lengths_;
  // Not read anywhere in this header; set to false by init(int).
  bool unichar_info_present;

 private:
  void delete_blob_choices();
};
// Make WERD_CHOICE listable.
// ELISTIZEH is a macro defined outside this header. Presumably it declares
// the WERD_CHOICE_LIST type used by the typedef that follows -- confirm
// against the list macro header.
ELISTIZEH (WERD_CHOICE)
// Convenience names for vectors of list pointers. Whether the vectors own
// the pointed-to lists is not expressed by these types.
typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
// Utilities for comparing WERD_CHOICEs
// Declaration only; the definition lives outside this header.
// NOTE(review): going by the name, this compares the two words while
// disregarding letter case and leading/trailing punctuation -- confirm the
// exact rules against the implementation.
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
const WERD_CHOICE &word2);
// Utilities for debug printing.
// Two-argument overload: takes an intro message and a list of results but no
// unicharset. Declaration only; defined outside this header.
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
// Debug-printing overload that additionally receives the unicharset to use
// for id-to-unichar conversion. Declaration only; defined outside this
// header.
void print_ratings_list(
    const char *msg,                      // intro message
    BLOB_CHOICE_LIST *ratings,            // list of results
    const UNICHARSET &current_unicharset  // unicharset that can be used
                                          // for id-to-unichar conversion
    );
// Debug-printing helper that writes to the given FILE rather than the
// default debug output. Declaration only; defined outside this header.
void print_ratings_info(
    FILE *fp,                             // file to use
    BLOB_CHOICE_LIST *ratings,            // list of results
    const UNICHARSET &current_unicharset  // unicharset that can be used
                                          // for id-to-unichar conversion
    );
// Debug-printing helper for a vector of per-character choice lists.
// Declaration only; defined outside this header.
// NOTE(review): `detailed` presumably selects a more verbose output format --
// confirm against the implementation.
void print_char_choices_list(
    const char *msg,
    const BLOB_CHOICE_LIST_VECTOR &char_choices,
    const UNICHARSET &current_unicharset,
    BOOL8 detailed
    );
// Debug-printing helper that takes a word and a vector of alternate
// WERD_CHOICE pointers. Declaration only; defined outside this header.
void print_word_alternates_list(
WERD_CHOICE *word,
GenericVector<WERD_CHOICE *> *alternates);
#endif  // RATNGS_H