/usr/include/irstlm/lmtable.h is in libirstlm-dev 5.80.03-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// $Id: lmtable.h 3686 2010-10-15 11:55:32Z bertoldi $
/******************************************************************************
IrstLM: IRST Language Model Toolkit
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
#ifndef MF_LMTABLE_H
#define MF_LMTABLE_H
#ifndef WIN32
#include <sys/types.h>
#include <sys/mman.h>
#endif
#include <math.h>
#include <cstdlib>
#include <string>
#include <set>
#include <limits>
#include "util.h"
#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
#include "lmContainer.h"
// Function-like min/max helpers. Being macros, each argument may be
// evaluated twice, so do not pass expressions with side effects.
#define MAX(a,b) (((a)>(b))?(a):(b))
#define MIN(a,b) (((a)<(b))?(a):(b))
// Highest n-gram level supported; lmtable sizes its per-level arrays
// as LMTMAXLEV+1.
#define LMTMAXLEV 20
// NOTE(review): presumably the maximum length of a text line read while
// loading -- not used in this header, confirm against the implementation.
#define MAX_LINE 100000
// Number of bytes used to store a word code inside a node. Can be
// overridden by defining LMTCODESIZE before including this header.
#ifndef LMTCODESIZE
#define LMTCODESIZE (int)3
#endif
// Byte sizes of the fields stored in the tables.
#define SHORTSIZE (int)2
#define PTRSIZE (int)sizeof(char *)
#define INTSIZE (int)4
#define CHARSIZE (int)1
#define PROBSIZE (int)4 //use float
#define QPROBSIZE (int)1 //use qfloat_t
//#define BOUNDSIZE (int)4 //previous fixed-size definition
#define BOUNDSIZE (int)sizeof(table_entry_pos_t) //use table_entry_pos_t
// NOTE(review): not used in this header; meaning must be checked in the
// implementation file.
#define UNIGRAM_RESOLUTION 10000000.0
// Node layouts handled by lmtable::nodesize(): INTERNAL/QINTERNAL nodes
// carry code + prob + bow + bound, LEAF/QLEAF nodes carry code + prob only;
// the Q variants use QPROBSIZE-byte values instead of PROBSIZE-byte ones.
typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE;
// OUTFILE_TYPE is still used by lmtable::load(); it is presumably declared
// in one of the included headers now -- confirm.
//typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE;
// A node is addressed by a pointer to its first byte in a level table.
typedef char* node;
// Actions accepted by lmtable::search() and lmtable::succscan().
typedef enum {LMT_FIND, //!< search: find an entry
LMT_ENTER, //!< search: enter an entry
LMT_INIT, //!< scan: start scan
LMT_CONT //!< scan: continue scan
} LMT_ACTION;
typedef unsigned int table_entry_pos_t; //type for pointing to a full ngram in the table
typedef unsigned long table_pos_t; // type for pointing to a single char in the table
typedef unsigned char qfloat_t; //type for quantized probabilities
//CHECK this part to HERE
// Reserved bound values just below the maximum of table_entry_pos_t.
// The macros name numeric_limits unqualified, so it must be visible
// (e.g. through a using-directive) wherever they are expanded.
// NOTE(review): presumably markers for "empty" successor ranges -- confirm.
#define BOUND_EMPTY1 (numeric_limits<table_entry_pos_t>::max() - 2)
#define BOUND_EMPTY2 (numeric_limits<table_entry_pos_t>::max() - 1)
class lmtable: public lmContainer
{
static const bool debug=true;
void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap);
void loadtxt_ram(std::istream& inp,const char* header);
void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename);
void loadtxt_level(std::istream& inp,int l);
void loadbin(std::istream& inp,const char* header,const char* filename,int mmap);
void loadbin_header(std::istream& inp, const char* header);
void loadbin_dict(std::istream& inp);
void loadbin_codebook(std::istream& inp,int l);
void loadbin_level(std::istream& inp,int l);
protected:
char* table[LMTMAXLEV+1]; //storage of all levels
LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels
table_entry_pos_t cursize[LMTMAXLEV+1]; //current size of levels
//current offset for in-memory tables (different for each level
//needed to manage partial tables
// mempos = diskpos - offset[level]
table_entry_pos_t tb_offset[LMTMAXLEV+1];
table_entry_pos_t maxsize[LMTMAXLEV+1]; //max size of levels
table_entry_pos_t* startpos[LMTMAXLEV+1]; //support vector to store start positions
char info[100]; //information put in the header
//statistics
int totget[LMTMAXLEV+1];
int totbsearch[LMTMAXLEV+1];
//probability quantization
bool isQtable;
//Incomplete LM table from distributed training
bool isItable;
//Table with reverted n-grams for fast access
bool isInverted;
//Table might contain pruned n-grams
bool isPruned;
int NumCenters[LMTMAXLEV+1];
float* Pcenters[LMTMAXLEV+1];
float* Bcenters[LMTMAXLEV+1];
double logOOVpenalty; //penalty for OOV words (default 0)
int dictionary_upperbound; //set by user
int backoff_state;
//improve access speed
int max_cache_lev;
NGRAMCACHE_t* prob_and_state_cache;
NGRAMCACHE_t* lmtcache[LMTMAXLEV+1];
float ngramcache_load_factor;
float dictionary_load_factor;
//memory map on disk
int memmap; //level from which n-grams are accessed via mmap
int diskid;
off_t tableOffs[LMTMAXLEV+1];
off_t tableGaps[LMTMAXLEV+1];
// is this LM queried for knowing the matching order or (standard
// case) for score?
bool orderQuery;
//flag to enable/disable deletion of dict in the destructor
bool delete_dict;
public:
#ifdef TRACE_CACHELM
std::fstream* cacheout;
int sentence_id;
#endif
dictionary *dict; // dictionary (words - macro tags)
lmtable(float nlf=0.0, float dlfi=0.0);
virtual ~lmtable();
table_entry_pos_t wdprune(float *thr, int aflag=0);
table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0);
double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0);
table_entry_pos_t ngcnt(table_entry_pos_t *cnt);
table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos);
int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s);
void init_prob_and_state_cache();
void init_probcache() {
init_prob_and_state_cache();
}; //kept for back compatibility
void init_statecache() {}; //kept for back compatibility
void init_lmtcaches(int uptolev);
void init_caches(int uptolev);
void used_prob_and_state_cache();
void used_lmtcaches();
void used_caches();
void delete_prob_and_state_cache();
void delete_probcache() {
delete_prob_and_state_cache();
}; //kept for back compatibility
void delete_statecache() {}; //kept for back compatibility
void delete_lmtcaches();
void delete_caches();
void check_prob_and_state_cache_levels();
void check_probcache_levels() {
check_prob_and_state_cache_levels();
}; //kept for back compatibility
void check_statecache_levels() {}; //kept for back compatibility
void check_lmtcaches_levels();
void check_caches_levels();
void reset_prob_and_state_cache();
void reset_probcache() {
reset_prob_and_state_cache();
}; //kept for back compatibility
void reset_statecache() {}; //kept for back compatibility
void reset_lmtcaches();
void reset_caches();
bool are_prob_and_state_cache_active();
bool is_probcache_active() {
return are_prob_and_state_cache_active();
}; //kept for back compatibility
bool is_statecache_active() {
return are_prob_and_state_cache_active();
}; //kept for back compatibility
bool are_lmtcaches_active();
bool are_caches_active();
void reset_mmap();
//set the inverted flag to load ngrams in an inverted order
//this choice is disregarded if a binary LM is loaded,
//because the info is stored into the header
bool is_inverted(const bool flag) {
return isInverted=flag;
}
bool is_inverted() {
return isInverted;
}
void configure(int n,bool quantized);
//set penalty for OOV words
double getlogOOVpenalty() const {
return logOOVpenalty;
}
double setlogOOVpenalty(int dub) {
assert(dub > dict->size());
dictionary_upperbound = dub;
return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10;
}
double setlogOOVpenalty(double oovp) {
return logOOVpenalty=oovp;
}
virtual int maxlevel() const {
return maxlev;
};
bool isQuantized() const {
return isQtable;
}
void savetxt(const char *filename);
void savebin(const char *filename);
void appendbin_level(int level, fstream &out, int mmap);
void appendbin_level_nommap(int level, fstream &out);
void appendbin_level_mmap(int level, fstream &out);
void savebin_level(int level, const char* filename, int mmap);
void savebin_level_nommap(int level, const char* filename);
void savebin_level_mmap(int level, const char* filename);
void savebin_dict(std::fstream& out);
void compact_all_levels(const char* filename);
void compact_single_level(int level, const char* filename);
void concatenate_all_levels(const char* fromfilename, const char* tofilename);
void concatenate_single_level(int level, const char* fromfilename, const char* tofilename);
void remove_all_levels(const char* filename);
void remove_single_level(int level, const char* filename);
void print_table_stat();
void print_table_stat(int level);
void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos);
void delete_level(int level, const char* outfilename, int mmap);
void delete_level_nommap(int level);
void delete_level_mmap(int level, const char* filename);
void resize_level(int level, const char* outfilename, int mmap);
void resize_level_nommap(int level);
void resize_level_mmap(int level, const char* filename);
inline void update_offset(int level, table_entry_pos_t value) { tb_offset[level]=value; };
void load(const std::string filename, int mmap=0);
void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0,OUTFILE_TYPE outtype=NONE);
void load_centers(std::istream& inp,int l);
void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap);
void expand_level_nommap(int level, table_entry_pos_t size);
void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename);
void cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true);
int reload(std::set<string> words);
void filter(const char* /* unused parameter: lmfile */) {};
virtual double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL);
int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx);
int add(ngram& ng, float prob,float bow);
//template<typename TA, typename TB> int add(ngram& ng, TA prob,TB bow);
int addwithoffset(ngram& ng, float prob,float bow);
// template<typename TA, typename TB> int addwithoffset(ngram& ng, TA prob,TB bow);
void checkbounds(int level);
inline int get(ngram& ng) {
return get(ng,ng.size,ng.size);
}
int get(ngram& ng,int n,int lev);
int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
inline void putmem(char* ptr,int value,int offs,int size) {
assert(ptr!=NULL);
for (int i=0; i<size; i++)
ptr[offs+i]=(value >> (8 * i)) & 0xff;
};
inline void getmem(char* ptr,int* value,int offs,int size) {
assert(ptr!=NULL);
*value=ptr[offs] & 0xff;
for (int i=1; i<size; i++){
*value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
}
};
template<typename T>
inline void putmem(char* ptr,T value,int offs) {
assert(ptr!=NULL);
memcpy(ptr+offs, &value, sizeof(T));
};
template<typename T>
inline void getmem(char* ptr,T* value,int offs) {
assert(ptr!=NULL);
memcpy((void*)value, ptr+offs, sizeof(T));
};
int nodesize(LMT_TYPE ndt) {
switch (ndt) {
case INTERNAL:
return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
case QINTERNAL:
return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
case LEAF:
return LMTCODESIZE + PROBSIZE;
case QLEAF:
return LMTCODESIZE + QPROBSIZE;
default:
assert(0);
return 0;
}
}
inline int word(node nd,int value=-1) {
int offset=0;
if (value==-1)
getmem(nd,&value,offset,LMTCODESIZE);
else
putmem(nd,value,offset,LMTCODESIZE);
return value;
};
int codecmp(node a,node b) {
register int i,result;
for (i=(LMTCODESIZE-1); i>=0; i--) {
result=(unsigned char)a[i]-(unsigned char)b[i];
if(result) return result;
}
return 0;
};
int codediff(node a,node b) {
return word(a)-word(b);
};
inline float prob(node nd,LMT_TYPE ndt) {
int offs=LMTCODESIZE;
float fv;
unsigned char cv;
switch (ndt) {
case INTERNAL:
getmem(nd,&fv,offs);
return fv;
case QINTERNAL:
getmem(nd,&cv,offs);
return (float) cv;
case LEAF:
getmem(nd,&fv,offs);
return fv;
case QLEAF:
getmem(nd,&cv,offs);
return (float) cv;
default:
assert(0);
return 0;
}
};
template<typename T>
inline T prob(node nd, LMT_TYPE ndt, T value) {
int offs=LMTCODESIZE;
switch (ndt) {
case INTERNAL:
putmem(nd, value,offs);
break;
case QINTERNAL:
putmem(nd,(unsigned char) value,offs);
break;
case LEAF:
putmem(nd, value,offs);
break;
case QLEAF:
putmem(nd,(unsigned char) value,offs);
break;
default:
assert(0);
return (T) 0;
}
return value;
};
inline float bow(node nd,LMT_TYPE ndt) {
int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
float fv;
unsigned char cv;
switch (ndt) {
case INTERNAL:
getmem(nd,&fv,offs);
return fv;
case QINTERNAL:
getmem(nd,&cv,offs);
return (float) cv;
case LEAF:
getmem(nd,&fv,offs);
return fv;
case QLEAF:
getmem(nd,&cv,offs);
return (float) cv;
default:
assert(0);
return 0;
}
};
template<typename T>
inline T bow(node nd,LMT_TYPE ndt, T value) {
int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
switch (ndt) {
case INTERNAL:
putmem(nd, value,offs);
break;
case QINTERNAL:
putmem(nd,(unsigned char) value,offs);
break;
case LEAF:
putmem(nd, value,offs);
break;
case QLEAF:
putmem(nd,(unsigned char) value,offs);
break;
default:
assert(0);
return 0;
}
return value;
};
inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level){ return bound(nd,ndt) - tb_offset[level+1]; }
inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level){ return bound(nd, ndt, value + tb_offset[level+1]); }
// table_entry_pos_t bound(node nd,LMT_TYPE ndt, int level=0) {
table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
table_entry_pos_t value;
getmem(nd,&value,offs);
// value -= tb_offset[level+1];
return value;
};
// table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level=0) {
table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
// value += tb_offset[level+1];
putmem(nd,value,offs);
return value;
};
//template<typename T> T boundwithoffset(node nd,LMT_TYPE ndt, T value, int level);
/*
table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
table_entry_pos_t value;
getmem(nd,&value,offs);
return value;
// return value-tb_offset[level+1];
};
*/
/*
table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
putmem(nd,value,offs);
return value;
// return value+tb_offset[level+1];
};
*/
/*
inline table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
table_entry_pos_t value;
getmem(nd,&value,offs);
return value;
};
template<typename T>
inline T bound(node nd,LMT_TYPE ndt, T value) {
int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
putmem(nd,value,offs);
return value;
};
*/
//returns the indexes of the successors of a node
int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL);
void stat(int lev=0);
void printTable(int level);
virtual inline void setDict(dictionary* d) {
if (delete_dict==true && dict) delete dict;
dict=d;
delete_dict=false;
};
virtual inline dictionary* getDict() const {
return dict;
};
inline table_entry_pos_t getCurrentSize(int l) const {
return cursize[l];
};
inline void setOrderQuery(bool v) {
orderQuery = v;
}
inline bool isOrderQuery() const {
return orderQuery;
}
inline float GetNgramcacheLoadFactor() {
return ngramcache_load_factor;
}
inline float GetDictioanryLoadFactor() {
return ngramcache_load_factor;
}
//never allow the increment of the dictionary through this function
inline virtual void dictionary_incflag(const bool flag) {
UNUSED(flag);
};
inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) {
std::cerr << "filtering... \n";
dictionary *dict=new dictionary((char *)sfilter.c_str());
cpsublm(sublmt, dict,(skeepunigrams=="yes"));
delete dict;
std::cerr << "...done\n";
return true;
}
inline virtual bool is_OOV(int code) {
return (code == dict->oovcode());
};
};
#endif
|