This file is indexed.

/usr/include/googlepinyin/dictbuilder.h is in libgooglepinyin0-dev 0.1.2-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
#define PINYINIME_INCLUDE_DICTBUILDER_H__

#include <stdlib.h>
#include "./utf16char.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./spellingtable.h"
#include "./spellingtrie.h"
#include "./splparser.h"

namespace ime_pinyin {

#ifdef ___BUILD_MODEL___

#define ___DO_STATISTICS___

class DictTrie;

// DictBuilder turns a raw dictionary file into a dictionary trie; the only
// public entry point is build_dict(). The class is declared only when
// ___BUILD_MODEL___ is defined.
//
// NOTE(review): the class holds raw pointers and declares a destructor, but
// declares no copy constructor or assignment operator. If the destructor
// releases these buffers, copying an instance would be unsafe -- confirm
// against the implementation.
class DictBuilder {
 private:
  // The raw lemma array buffer and the number of lemmas in it.
  LemmaEntry *lemma_arr_;
  size_t lemma_num_;

  // Used to store all possible single-character items.
  // Two items may have the same Hanzi while their spelling ids are different.
  SingleCharItem *scis_;
  size_t scis_num_;

  // In the tree, the root's level is -1.
  // Lemma nodes for the root and for level 0.
  LmaNodeLE0 *lma_nodes_le0_;

  // Lemma nodes for layers whose levels are deeper than 0.
  LmaNodeGE1 *lma_nodes_ge1_;

  // Number of used lemma nodes (one counter per node buffer above).
  size_t lma_nds_used_num_le0_;
  size_t lma_nds_used_num_ge1_;

  // Used to store the ids of homophones.
  LemmaIdType *homo_idx_buf_;
  // Number of homophones each of which contains only one Chinese character.
  size_t homo_idx_num_eq1_;
  // Number of homophones each of which contains more than one character.
  size_t homo_idx_num_gt1_;

  // The items with the highest scores, and how many there are.
  LemmaEntry *top_lmas_;
  size_t top_lmas_num_;

  // Spelling table and spelling parser used by the builder.
  // NOTE(review): ownership and lifetime are not visible in this header.
  SpellingTable *spl_table_;
  SpellingParser *spl_parser_;

#ifdef ___DO_STATISTICS___
  // Statistics counters, present only when ___DO_STATISTICS___ is defined.
  // Each array has kMaxLemmaSize entries; presumably one entry per tree
  // level -- confirm against the implementation.
  size_t max_sonbuf_len_[kMaxLemmaSize];
  size_t max_homobuf_len_[kMaxLemmaSize];

  size_t total_son_num_[kMaxLemmaSize];
  size_t total_node_hasson_[kMaxLemmaSize];
  size_t total_sonbuf_num_[kMaxLemmaSize];
  size_t total_sonbuf_allnoson_[kMaxLemmaSize];
  size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
  size_t total_homo_num_[kMaxLemmaSize];

  size_t sonbufs_num1_;     // Number of son buffers with only 1 son.
  size_t sonbufs_numgt1_;   // Number of son buffers with more than 1 son.

  size_t total_lma_node_num_;

  // Statistics helpers; presumably they reset and print the counters
  // above -- confirm against the implementation.
  void stat_init();
  void stat_print();
#endif  // ___DO_STATISTICS___

 public:

  // Constructor and destructor; their definitions are not in this header.
  DictBuilder();
  ~DictBuilder();

  // Build the dictionary trie from the file fn_raw. The file fn_validhzs
  // provides the valid chars. If fn_validhzs is NULL, only chars in GB2312
  // will be included.
  // dict_trie is the trie being built.
  // NOTE(review): the return value presumably reports success -- confirm.
  bool build_dict(const char* fn_raw, const char* fn_validhzs,
                  DictTrie *dict_trie);

 private:
  // Fill in the buffer with id. The caller guarantees that the parameters
  // are valid.
  void id_to_charbuf(unsigned char *buf, LemmaIdType id);

  // Update the offset of sons for a node.
  void set_son_offset(LmaNodeGE1 *node, size_t offset);

  // Update the offset of the homophones' ids for a node.
  void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);

  // Format a spelling string.
  // NOTE(review): spl_str is non-const, so it is presumably rewritten in
  // place -- confirm.
  void format_spelling_str(char *spl_str);

  // Sort lemma_arr_ by the Hanzi string, and give each unique item an id.
  // The lemma list is sorted by Hanzi string so that items starting with a
  // given prefix string can be found to do prediction.
  // Actually, the single-character items may be in another order, for
  // example, in spelling id order, etc.
  // The return value is the next unallocated id available.
  LemmaIdType sort_lemmas_by_hz();

  // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
  // lemma buffer lemma_arr_.
  // This function should be called after the lemma array is ready.
  // Returns the number of unique SingleCharItem elements.
  size_t build_scis();

  // Construct a subtree using a subset of the spelling array (from
  // item_start to item_end).
  // parent is the parent node whose information is updated as needed;
  // it can be a member of LmaNodeLE0 or LmaNodeGE1, which is why it is
  // passed as void*.
  // NOTE(review): whether item_end is inclusive is not stated here -- confirm.
  bool construct_subset(void* parent, LemmaEntry* lemma_arr,
                        size_t item_start, size_t item_end, size_t level);


  // Read the valid Chinese Hanzis from the given file.
  // num is used to return the number of chars.
  // The returned buffer is sorted, and the caller needs to free it.
  char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);


  // Read a raw dictionary. max_item is the maximum number of items. If there
  // are more items in the dictionary, only the first max_item will be read.
  // The returned value is the number of items successfully read from the
  // file.
  size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
                       size_t max_item);

  // Check whether the character hz is in the hzs buffer, which holds
  // hzs_len characters.
  bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);

  // Check whether all str_len characters in str are in the hzs buffer,
  // which holds hzs_len characters.
  bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
                          const char16 *str, size_t str_len);

  // Get the lemmas with the highest scores.
  void get_top_lemmas();

  // Allocate the resources needed to build the dictionary.
  // lma_num is the number of items to be loaded.
  // NOTE(review): the return value presumably reports success -- confirm.
  bool alloc_resource(size_t lma_num);

  // Free the resources.
  void free_resource();
};
#endif  // ___BUILD_MODEL___
}

#endif  // PINYINIME_INCLUDE_DICTBUILDER_H__