// /usr/include/tesseract/shapetable.h is in libtesseract-dev 3.02.01-2.
// This file is owned by root:root, with mode 0o644.
// The actual contents of the file can be viewed below.
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: shapetable.h
// Description: Class to map a classifier shape index to unicharset
// indices and font indices.
// Author: Ray Smith
// Created: Thu Oct 28 17:46:32 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
#define TESSERACT_CLASSIFY_SHAPETABLE_H_
#include "genericvector.h"
#include "intmatcher.h"
class STRING;
class UNICHARSET;
namespace tesseract {
// Pairs a single unichar-id with the collection of fonts recorded for it.
// A shape is built from a vector of UnicharAndFonts.
struct UnicharAndFonts {
  // Default entry: unichar_id is 0 and font_ids is default-constructed.
  UnicharAndFonts() : unichar_id(0) {}

  // Entry for uni_id with font_id pushed onto font_ids as its first font.
  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
    font_ids.push_back(font_id);
  }

  // Serializes this entry to fp. A false return signals an error.
  bool Serialize(FILE* fp);

  // Deserializes this entry from fp. A false return signals an error.
  // Pass swap == true when the data needs a big/little-endian swap.
  bool DeSerialize(bool swap, FILE* fp);

  // Comparison function used to order UnicharAndFonts by unichar_id.
  // NOTE(review): the (const void*, const void*) -> int signature looks
  // qsort-style, with v1/v2 pointing at the entries to compare -- confirm
  // against the definition.
  static int SortByUnicharId(const void* v1, const void* v2);

  // Font ids associated with unichar_id.
  GenericVector<int> font_ids;
  // The unichar-id that the fonts above belong to.
  int unichar_id;
};
// A Shape is a collection of unichar-ids and a list of fonts associated with
// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
// a classifiable unit, and represents a group of characters or parts of
// characters that have a similar or identical shape. Shapes/ShapeTables may
// be organized hierarchically from identical shapes at the leaves to vaguely
// similar shapes near the root.
class Shape {
 public:
  // Creates an empty, unmerged shape (destination_index_ == -1).
  // unichars_sorted_ is explicitly set to false so that the flag never holds
  // an indeterminate value; false is the conservative choice, as it only
  // means "not known to be sorted". Initializers are listed in member
  // declaration order.
  Shape() : unichars_sorted_(false), destination_index_(-1) {}

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Returns the stored destination index (-1 on a freshly constructed shape).
  int destination_index() const {
    return destination_index_;
  }
  // Sets the stored destination index.
  void set_destination_index(int index) {
    destination_index_ = index;
  }
  // Returns the number of UnicharAndFonts entries in the shape.
  int size() const {
    return unichars_.size();
  }
  // Returns a UnicharAndFonts entry for the given index, which must be
  // in the range [0, size()).
  const UnicharAndFonts& operator[](int index) const {
    return unichars_[index];
  }

  // Adds a font_id for the given unichar_id. If the unichar_id is not
  // in the shape, it is added.
  void AddToShape(int unichar_id, int font_id);
  // Adds everything in other to this.
  void AddShape(const Shape& other);
  // Returns true if the shape contains the given unichar_id, font_id pair.
  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
  // Returns true if the shape contains the given unichar_id, ignoring font.
  bool ContainsUnichar(int unichar_id) const;
  // Returns true if the shape contains the given font, ignoring unichar_id.
  bool ContainsFont(int font_id) const;
  // Returns true if this is a subset (including equal) of other.
  bool IsSubsetOf(const Shape& other) const;
  // Returns true if the lists of unichar ids are the same in this and other,
  // ignoring fonts.
  // NOT const, as it will sort the unichars on demand.
  bool IsEqualUnichars(Shape* other);

 private:
  // Sorts the unichars_ vector by unichar.
  void SortUnichars();

  // Flag indicates that the unichars are sorted, allowing faster set
  // operations with another shape.
  bool unichars_sorted_;
  // If this Shape is part of a ShapeTable the destination_index_ is the index
  // of some other shape in the ShapeTable with which this shape is merged.
  int destination_index_;
  // Array of unichars, each with a set of fonts. Each unichar has at most
  // one entry in the vector.
  GenericVector<UnicharAndFonts> unichars_;
};
// ShapeTable wraps the three levels of indirection used here:
//   * a ShapeTable is a vector of shapes;
//   * each shape is a vector of UnicharAndFonts, i.e. the set of unichars
//     the shape stands for;
//   * each UnicharAndFonts additionally records the fonts of its unichar_id
//     that were mapped to the shape during training.
class ShapeTable {
 public:
  ShapeTable();
  // The UNICHARSET passed here (or later via set_unicharset) has to outlive
  // the ShapeTable. Only DebugStr makes use of it.
  explicit ShapeTable(const UNICHARSET& unicharset);

  // Serializes the table to fp. A false return signals an error.
  bool Serialize(FILE* fp) const;
  // Deserializes the table from fp. A false return signals an error.
  // Pass swap == true when the data needs a big/little-endian swap.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors.
  // Number of entries held in the table.
  int NumShapes() const { return shape_table_.size(); }
  // The unicharset supplied to the constructor or set_unicharset; the stored
  // pointer is dereferenced without any check.
  const UNICHARSET& unicharset() const { return *unicharset_; }
  // Only the address of unicharset is stored, so the UNICHARSET has to
  // outlive the ShapeTable.
  void set_unicharset(const UNICHARSET& unicharset) {
    unicharset_ = &unicharset;
  }

  // Builds a string that lists the classes/fonts of the given shape.
  STRING DebugStr(int shape_id) const;
  // Builds a debug string that summarizes the whole table.
  STRING SummaryStr() const;

  // Starts a new shape from the given unichar_id and font_id and returns
  // the index assigned to it.
  int AddShape(int unichar_id, int font_id);
  // Stores a copy of other and returns the index assigned to it.
  int AddShape(const Shape& other);
  // Deletes the shape at shape_id. Every index above it changes!
  void DeleteShape(int shape_id);
  // Adds font_id for unichar_id to the existing shape at shape_id. A
  // unichar_id that is not yet part of the shape gets added.
  void AddToShape(int shape_id, int unichar_id, int font_id);
  // Adds other to the existing shape at shape_id.
  void AddShapeToShape(int shape_id, const Shape& other);

  // Looks up the shape containing the given unichar and font and returns
  // its id, or -1 when there is none. With font_id < 0 the font is ignored
  // and the first shape matching unichar_id is returned.
  int FindShape(int unichar_id, int font_id) const;
  // Outputs the first unichar_id and font_id of the given shape.
  void GetFirstUnicharAndFont(int shape_id,
                              int* unichar_id, int* font_id) const;

  // Access to the Shape stored at shape_id.
  const Shape& GetShape(int shape_id) const {
    const Shape* shape = shape_table_[shape_id];
    return *shape;
  }
  Shape* MutableShape(int shape_id) { return shape_table_[shape_id]; }

  // Builds a ShapeTable by expanding each class/font of shape individually.
  // NOTE(review): the meaning of the int result is not stated here -- see
  // the definition.
  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
  // True when the two shapes have already been merged.
  bool AlreadyMerged(int shape_id1, int shape_id2);
  // True when at least one shape holds more than one unichar.
  bool AnyMultipleUnichars();
  // Largest unichar count found in any shape.
  int MaxNumUnichars() const;
  // Merges the shapes in [start, end) that share a unichar.
  // Assumes each shape has a single unichar.
  void ForceFontMerges(int start, int end);
  // Unichar count of the master shape.
  int MasterUnicharCount(int shape_id) const;
  // Total of the font counts of the master shape.
  int MasterFontCount(int shape_id) const;
  // Unichar count that merging the two shapes would produce.
  int MergedUnicharCount(int shape_id1, int shape_id2) const;
  // Merges the two shape_ids; shape_id2 is left marked as merged.
  void MergeShapes(int shape_id1, int shape_id2);
  // Appends other's master shapes to this table. Useful for producing a
  // clean ShapeTable out of a merged one, or for copying a ShapeTable.
  void AppendMasterShapes(const ShapeTable& other);
  // Count of master shapes left after merging.
  int NumMasterShapes() const;
  // Destination of the given shape (if merged), allowing for the
  // destination having been merged itself. An unmerged shape yields the
  // shape_id that was passed in.
  int MasterDestinationIndex(int shape_id) const;

 private:
  // Caller-provided unicharset; used only by the DebugStr member.
  const UNICHARSET* unicharset_;
  // Pointers to the Shapes that make up this ShapeTable.
  PointerVector<Shape> shape_table_;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_