This file is indexed.

/usr/include/ngram/ngram-output.h is in libngram-dev 1.3.2-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2016 Brian Roark and Google, Inc.
// NGram model class for outputting a model or outputting perplexity of text.

#ifndef NGRAM_NGRAM_OUTPUT_H_
#define NGRAM_NGRAM_OUTPUT_H_

#include <cmath>
#include <iostream>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

#include <fst/compose.h>

#include <ngram/ngram-context.h>
#include <ngram/ngram-mutable-model.h>
#include <ngram/util.h>

namespace ngram {

// Standard-library stream type made available unqualified inside ngram.
using std::ostringstream;

// OpenFst names made available unqualified inside ngram.  The composition
// helpers of NGramOutput use ComposeFst, ComposeFstOptions, CacheOptions,
// PhiMatcher and MATCHER_REWRITE_NEVER.
using fst::CacheOptions;
using fst::ComposeFst;
using fst::ComposeFstOptions;
using fst::MATCHER_REWRITE_NEVER;
using fst::PhiMatcher;
using fst::StdFst;

// Label value -2.  NOTE(review): presumably the default "special" (phi)
// label that replaces the backoff label for phi matching -- confirm against
// the callers of MakePhiMatcherLM.
static const int kSpecialLabel = -2;

// N-gram model specialised for output: printing the model (plain or ARPA
// format), reporting perplexity of input text, and emitting random samples.
// Everything that is printed goes to the std::ostream given at construction.
class NGramOutput : public NGramMutableModel<StdArc> {
 public:
  using StateId = StdArc::StateId;
  using Label = StdArc::Label;
  using Weight = StdArc::Weight;

  // Builds the output model on top of `infst`, which is assumed to already be
  // an n-gram model FST.
  //   infst                 model FST handed to the NGramMutableModel base.
  //   ostrm                 destination stream for all output (default
  //                         std::cout); held by reference, so it must outlive
  //                         this object.
  //   backoff_label         label of backoff arcs, forwarded to the base.
  //   check_consistency     not used by this constructor; kept so existing
  //                         callers continue to compile.
  //   context_pattern       pattern used to build the NGramContext; when it
  //                         is non-empty the fourth base-class constructor
  //                         argument is set to true.
  //   include_all_suffixes  stored in include_all_suffixes_.
  // If the FST has no input symbol table an error is logged and the model's
  // error flag is set.
  explicit NGramOutput(StdMutableFst *infst, std::ostream &ostrm = std::cout,
                       Label backoff_label = 0, bool check_consistency = false,
                       const string &context_pattern = "",
                       bool include_all_suffixes = false)
      : NGramMutableModel<StdArc>(infst, backoff_label, kNormEps,
                                  !context_pattern.empty()),
        ostrm_(ostrm),
        include_all_suffixes_(include_all_suffixes),
        context_(context_pattern, HiOrder()) {
    if (!GetFst().InputSymbols()) {
      NGRAMERROR() << "NGramOutput: no symbol tables provided";
      NGramModel<StdArc>::SetError();
    }
  }

  // How backoff weights are presented when the model is printed.
  enum ShowBackoff {
    EPSILON,  // Show backoff Weights as explicit epsilon transitions.
    INLINE,  // Show backoff Weights in a third column when present.
    NONE,  // NOTE(review): presumably backoff weights are not shown -- confirm.
  };

  // Prints the n-gram model, one n-gram with its weight per line.
  // NOTE(review): `neglogs`, `intcnts` and `ARPA` presumably select negative
  // log output, integer counts and ARPA format -- confirm in the .cc file.
  void ShowNGramModel(ShowBackoff showeps, bool neglogs, bool intcnts,
                      bool ARPA) const;

  // Uses the n-gram model to calculate the perplexity of the input strings,
  // each given as an FST in `infsts`.
  // NOTE(review): `v` looks like a verbosity level and the return value like
  // a success flag -- confirm in the .cc file.
  bool PerplexityNGramModel(
      const std::vector<std::unique_ptr<fst::StdVectorFst>> &infsts,
      int32 v, bool phimatch, string *OOV_symbol, double OOV_class_size,
      double OOV_probability);

  // Draws `samples` random strings from the model and outputs them.  The
  // model is first converted from its backoff form; if that conversion sets
  // the error flag nothing is generated.
  void SampleStringsFromModel(int64 samples, bool show_backoff) {
    DeBackoffNGramModel();
    if (Error()) return;
    RandNGramModel(samples, show_backoff);
  }

  // Phi matcher type used on both sides of the compositions below.
  using NGPhiMatcher = PhiMatcher<Matcher<Fst<StdArc>>>;

  // Composes `infst` with the model FST.  The matcher on the `infst` side is
  // built with MATCH_NONE / kNoLabel; the matcher on the model side matches
  // `special_label` on input with MATCHER_REWRITE_NEVER.
  // The result is allocated with `new`; the caller is responsible for
  // deleting it.
  ComposeFst<StdArc> *FailLMCompose(const StdMutableFst &infst,
                                    Label special_label) const {
    return new ComposeFst<StdArc>(
        infst, GetFst(),
        ComposeFstOptions<StdArc, NGPhiMatcher>(
            CacheOptions(), new NGPhiMatcher(infst, MATCH_NONE, kNoLabel),
            new NGPhiMatcher(GetFst(), MATCH_INPUT, special_label, 1,
                             MATCHER_REWRITE_NEVER)));
  }

  // Same composition as the overload above, but the result is assigned into
  // the caller-provided mutable FST `ofst`.
  void FailLMCompose(const StdMutableFst &infst, StdMutableFst *ofst,
                     Label special_label) const {
    *ofst = ComposeFst<StdArc>(
        infst, GetFst(),
        ComposeFstOptions<StdArc, NGPhiMatcher>(
            CacheOptions(), new NGPhiMatcher(infst, MATCH_NONE, kNoLabel),
            new NGPhiMatcher(GetFst(), MATCH_INPUT, special_label, 1,
                             MATCHER_REWRITE_NEVER)));
  }

  // Switch backoff label to special label for phi matcher
  // assumed to be order preserving (as it is with <epsilon> and -2)
  void MakePhiMatcherLM(Label special_label);

  // Apply n-gram model to fst.  For now, assumes linear fst, accumulates stats
  // into the `logprob`, `words`, `oovs` and `words_skipped` out-parameters.
  double ApplyNGramToFst(const fst::StdVectorFst &input_fst,
                         const Fst<StdArc> &symbolfst, bool phimatch,
                         bool verbose, Label special_label, Label OOV_label,
                         double OOV_cost, double *logprob, int *words,
                         int *oovs, int *words_skipped);

  // Adds a phi loop (rho) at unigram state for OOVs
  // OOV_class_size (N) and OOV_probability (p) determine weight of loop: p/N
  // Rest of unigrams renormalized accordingly, by 1-p
  void RenormUnigramForOOV(Label special_label, Label OOV_label,
                           double OOV_class_size, double OOV_probability);

  // Checks to see if a state or ngram is in context
  bool InContext(StateId st) const;
  bool InContext(const std::vector<Label> &ngram) const;

 private:
  // Converts a negative natural-log cost to a (positive-is-likelier) log
  // value in the given base, for printing (ARPA): -neglogcost / ln(base).
  double ShowLogNewBase(double neglogcost, double base) const {
    return -neglogcost / std::log(base);
  }

  // Print the header portion of the ARPA model format
  void ShowARPAHeader() const;

  // Print n-grams leaving a particular state for the ARPA model format
  void ShowARPANGrams(StdArc::StateId st, const string &str, int order) const;

  // Print the N-gram model in ARPA format
  void ShowARPAModel() const;

  // Print n-grams leaving a particular state, standard output format
  void ShowNGrams(StdArc::StateId st, const string &str, ShowBackoff showeps,
                  bool neglogs, bool intcnts) const;

  // NOTE(review): presumably prints the symbol string encoded by `infst` --
  // confirm in the .cc file.
  void ShowStringFst(const Fst<StdArc> &infst) const;

  // NOTE(review): presumably relabels `infst` to the symbols of `symbolfst`
  // and installs the model's symbol tables -- confirm in the .cc file.
  void RelabelAndSetSymbols(StdMutableFst *infst, const Fst<StdArc> &symbolfst);

  // Perplexity accumulation over a phi-matcher composition result; totals are
  // reported through the pointer out-parameters.
  void ShowPhiPerplexity(const ComposeFst<StdArc> &cfst, bool verbose,
                         int special_label, Label OOV_label, double *logprob,
                         int *words, int *oovs, int *words_skipped) const;

  // Perplexity accumulation without phi matching; totals are reported through
  // the pointer out-parameters.
  void ShowNonPhiPerplexity(const Fst<StdArc> &infst, bool verbose,
                            double OOV_cost, Label OOV_label, double *logprob,
                            int *words, int *oovs, int *words_skipped) const;

  // NOTE(review): presumably advances model state `*mst` over `label`,
  // updating the running cost, counters, history string and n-gram --
  // confirm in the .cc file.
  void FindNextStateInModel(StateId *mst, Label label, double OOV_cost,
                            Label OOV_label, double *neglogprob, int *word_cnt,
                            int *oov_cnt, int *words_skipped, string *history,
                            bool verbose, std::vector<Label> *ngram) const;

  // Appends `symbol` to the n-gram history string `*str`, inserting a single
  // space first when the history is not empty.
  void AppendWordToNGramHistory(string *str, const string &symbol) const {
    if (!str->empty()) (*str) += ' ';
    (*str) += symbol;
  }

  // Calculate and show (if verbose) </s> n-gram, and accumulate stats
  void ApplyFinalCost(StateId mst, string history, int word_cnt, int oov_cnt,
                      int skipped, double neglogprob, double *logprob,
                      int *words, int *oovs, int *words_skipped, bool verbose,
                      const std::vector<Label> &ngram) const;

  // Writes the two-line column header for verbose n-gram entries.
  void ShowNGramProbHeader() const {
    ostrm_ << "                                                ";
    ostrm_ << "ngram  -logprob\n";
    ostrm_ << "        N-gram probability                      ";
    ostrm_ << "found  (base10)\n";
  }

  // Show the verbose n-gram entries with history order and neglogprob
  void ShowNGramProb(string symbol, string history, bool oov, int order,
                     double ngram_cost) const;

  // Writes summary perplexity numbers, similar to the summary given by SRILM.
  //   sentences      number of sentences scored.
  //   word_cnt       number of words seen; skipped words are subtracted
  //                  before the perplexity is computed.
  //   oov_cnt        number of out-of-vocabulary words.
  //   words_skipped  OOVs that had no probability; reported when positive.
  //   logprob        accumulated base-10 log probability.
  // Perplexity is 10^(-logprob / (words + sentences)).
  void ShowPerplexity(size_t sentences, int word_cnt, int oov_cnt,
                      int words_skipped, double logprob) const {
    ostrm_ << sentences << " sentences, ";
    ostrm_ << word_cnt << " words, ";
    ostrm_ << oov_cnt << " OOVs\n";
    if (words_skipped > 0) {
      ostrm_ << "NOTE: " << words_skipped << " OOVs with no probability"
             << " were skipped in perplexity calculation\n";
      word_cnt -= words_skipped;
    }
    // Sum in double: adding the signed word count directly to the unsigned
    // sentence count would convert a negative count to a huge size_t.
    const double scored_events =
        static_cast<double>(word_cnt) + static_cast<double>(sentences);
    ostrm_ << "logprob(base 10)= " << logprob;
    ostrm_ << ";  perplexity = ";
    ostrm_ << std::pow(10.0, -logprob / scored_events) << "\n\n";
  }

  // Calculate prob of </s> and add to accum'd prob, and update total prob
  double SetInitRandProb(StateId hi_state, StateId st, double *r) const;

  // Show symbol during random string generation
  StateId ShowRandSymbol(Label lbl, bool *first_printed, bool show_backoff,
                         StateId st) const;

  // Find random symbol and show if necessary
  StateId GetAndShowSymbol(StateId st, double p, double r, StateId *hi_state,
                           bool *first_printed, bool show_backoff) const;

  // Produce and output random samples from model using rand/srand
  void RandNGramModel(int64 samples, bool show_backoff) const;

  // Checks parameterization of perplexity calculation and sets OOV_label
  bool GetOOVLabel(double *OOV_probability, string *OOV_symbol,
                   StdArc::Label *OOV_label);

  std::ostream &ostrm_;        // Destination of all printed output (not owned).
  bool include_all_suffixes_;  // Copy of the constructor flag of the same name.
  NGramContext context_;       // Built from context_pattern and HiOrder().
};

}  // namespace ngram

#endif  // NGRAM_NGRAM_OUTPUT_H_