/usr/include/sdsl/wt_helper.hpp is in libsdsl-dev 2.0.3-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 | #ifndef INCLUDED_SDSL_WT_HELPER
#define INCLUDED_SDSL_WT_HELPER
#include "int_vector.hpp"
#include <algorithm>
#include <limits>
#include <deque>
#include <queue>
#include <vector>
#include <utility>
namespace sdsl
{
typedef std::pair<int_vector<>::size_type, int_vector<>::size_type> range_type;
typedef std::vector<range_type> range_vec_type;
//! Empty range check
/*! \param r Range to check
* \returns True if the range is empty, false otherwise.
*/
bool empty(const range_type& r);
//! Size of a range
/*! \param r Range to check
* \returns True if the range is empty, false otherwise.
*/
int_vector<>::size_type size(const range_type& r);
//! Count for each character the number of occurrences in rac[0..size-1]
/*!
* \param C An array of size 256, which contains for each character the number of occurrences in rac[0..size-1]
*/
template<class t_file_buffer,class t_rac>
void calculate_character_occurences(t_file_buffer& text, const int_vector_size_type size, t_rac& C)
{
C = t_rac();
if (text.size() < size) {
throw std::logic_error("calculate_character_occurrences: stream size is smaller than size!");
return;
}
for (int_vector_size_type i=0; i < size; ++i) {
uint64_t c = text[i];
if (c >= C.size()) { C.resize(c+1, 0); }
++C[c];
}
}
template<class t_rac, class sigma_type>
void calculate_effective_alphabet_size(const t_rac& C, sigma_type& sigma)
{
sigma = std::count_if(begin(C),end(C),[](decltype(*begin(C)) &x) {
return x > 0;
});
}
struct pc_node {
uint64_t freq; // frequency of symbol sym
uint64_t sym; // symbol
uint64_t parent; // pointer to the parent
uint64_t child[2]; // pointer to the children
enum :uint64_t {undef = 0xFFFFFFFFFFFFFFFFULL}; // max uint64_t value
pc_node(uint64_t freq=0, uint64_t sym=0, uint64_t parent=undef,
uint64_t child_left=undef, uint64_t child_right=undef);
pc_node& operator=(const pc_node& v);
};
template<class t_tree_strat_fat>
struct _node {
using node_type = typename t_tree_strat_fat::node_type;
typedef uint64_t size_type;
uint64_t bv_pos = 0; // pointer into the bit_vector, which represents the wavelet tree
uint64_t bv_pos_rank = 0; // pre-calculated rank for the prefix up to but not including bv_pos
node_type parent = t_tree_strat_fat::undef; // pointer to the parent
node_type child[2] = {t_tree_strat_fat::undef,t_tree_strat_fat::undef}; // pointer to the children
_node(uint64_t bv_pos=0, uint64_t bv_pos_rank=0, node_type parent=t_tree_strat_fat::undef,
node_type child_left=t_tree_strat_fat::undef, node_type child_right=t_tree_strat_fat::undef):
bv_pos(bv_pos), bv_pos_rank(bv_pos_rank), parent(parent) {
child[0] = child_left;
child[1] = child_right;
}
_node& operator=(const _node& v) {
if (this != &v) {
bv_pos = v.bv_pos;
bv_pos_rank = v.bv_pos_rank;
parent = v.parent;
child[0] = v.child[0];
child[1] = v.child[1];
}
return *this;
}
_node& operator=(const pc_node& v) {
bv_pos = v.freq;
bv_pos_rank = v.sym;
parent = v.parent;
child[0] = v.child[0];
child[1] = v.child[1];
return *this;
}
size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const {
structure_tree_node* st_child = structure_tree::add_child(v, name, util::class_name(*this));
uint64_t written_bytes = 0;
written_bytes += write_member(bv_pos, out);
written_bytes += write_member(bv_pos_rank, out);
written_bytes += write_member(parent, out);
out.write((char*)child, 2*sizeof(child[0]));
written_bytes += 2*sizeof(child[0]);
structure_tree::add_size(st_child, written_bytes);
return written_bytes;
}
void load(std::istream& in) {
read_member(bv_pos, in);
read_member(bv_pos_rank, in);
read_member(parent, in);
in.read((char*) child, 2*sizeof(child[0]));
}
};
// TODO: version of _byte_tree for lex_ordered tree shapes
// m_c_to_leaf can be compressed and
// m_path is only needed for sigma chars
// Strategy class for tree representation of a WT
template<bool t_dfs_shape, class t_wt>
struct _byte_tree {
using alphabet_category = byte_alphabet_tag;
using value_type = uint8_t;
using node_type = uint16_t; // node is represented by index in m_nodes
using data_node = _node<_byte_tree>;
enum :uint16_t {undef = 0xFFFF}; // max uint16_t value
enum :uint32_t {fixed_sigma = 256};
enum :uint8_t {int_width = 8}; // width of the input integers
std::vector<data_node> m_nodes; // nodes for the prefix code tree structure
node_type m_c_to_leaf[fixed_sigma]; // map symbol c to a leaf in the tree structure
// if m_c_to_leaf[c] == undef the char does
// not exists in the text
uint64_t m_path[fixed_sigma]; // path information for each char; the bits at position
// 0..55 hold path information; bits 56..63 the length
// of the path in binary representation
void copy(const _byte_tree& bt) {
m_nodes = bt.m_nodes;
for (uint32_t i=0; i<fixed_sigma; ++i)
m_c_to_leaf[i] = bt.m_c_to_leaf[i];
for (uint32_t i=0; i<fixed_sigma; ++i)
m_path[i] = bt.m_path[i];
}
_byte_tree() {}
_byte_tree(const std::vector<pc_node>& temp_nodes, uint64_t& bv_size, const t_wt*) {
m_nodes.resize(temp_nodes.size());
m_nodes[0] = temp_nodes.back(); // insert root at index 0
bv_size = 0;
size_t node_cnt = 1;
node_type last_parent = undef;
std::deque<node_type> q;
q.push_back(0);
while (!q.empty()) {
node_type idx;
if (!t_dfs_shape) {
idx = q.front(); q.pop_front();
} else {
idx = q.back(); q.pop_back();
}
// frq_sum is store in bv_pos value
uint64_t frq = m_nodes[idx].bv_pos;
m_nodes[idx].bv_pos = bv_size;
if (m_nodes[idx].child[0] != undef) // if node is not a leaf
bv_size += frq; // add frequency
if (idx > 0) { // node is not the root
if (last_parent != m_nodes[idx].parent)
m_nodes[m_nodes[idx].parent].child[0] = idx;
else
m_nodes[m_nodes[idx].parent].child[1] = idx;
last_parent = m_nodes[idx].parent;
}
if (m_nodes[idx].child[0] != undef) { // if node is not a leaf
for (uint32_t k=0; k<2; ++k) { // add children to tree
m_nodes[node_cnt] = temp_nodes[ m_nodes[idx].child[k] ];
m_nodes[node_cnt].parent = idx;
q.push_back(node_cnt);
m_nodes[idx].child[k] = node_cnt++;
}
}
}
// initialize m_c_to_leaf
for (uint32_t i=0; i<fixed_sigma; ++i)
m_c_to_leaf[i] = undef; // if c is not in the alphabet m_c_to_leaf[c] = undef
for (node_type v=0; v < m_nodes.size(); ++v) {
if (m_nodes[v].child[0] == undef) // if node is a leaf
m_c_to_leaf[(uint8_t)m_nodes[v].bv_pos_rank] = v; // calculate value
}
// initialize path information
// Note: In the case of a bfs search order,
// we can classify nodes as right child and left child with an easy criterion:
// node is a left child, if node%2==1
// node is a right child, if node%2==0
for (uint32_t c=0, prev_c=0; c<fixed_sigma; ++c) {
if (m_c_to_leaf[c] != undef) { // if char exists in the alphabet
node_type v = m_c_to_leaf[c];
uint64_t pw = 0; // path
uint64_t pl = 0; // path len
while (v != root()) { // while node is not the root
pw <<= 1;
if (m_nodes[m_nodes[v].parent].child[1] == v) // if the node is a right child
pw |= 1ULL;
++pl;
v = m_nodes[v].parent; // go up the tree
}
if (pl > 56) {
throw std::logic_error("Code depth greater than 56!!!");
}
m_path[c] = pw | (pl << 56);
prev_c = c;
} else {
uint64_t pl = 0; // len is 0, good for special case in rank
m_path[c] = prev_c | (pl << 56);
}
}
}
template<class t_rank_type>
void init_node_ranks(const t_rank_type& rank) {
for (uint64_t i=0; i<m_nodes.size(); ++i) {
if (m_nodes[i].child[0] != undef) // if node is not a leaf
m_nodes[i].bv_pos_rank = rank.rank(m_nodes[i].bv_pos);
}
}
_byte_tree(const _byte_tree& bt) {
copy(bt);
}
void swap(_byte_tree& bt) {
std::swap(m_nodes, bt.m_nodes);
for (uint32_t i=0; i<fixed_sigma; ++i) {
std::swap(m_c_to_leaf[i], bt.m_c_to_leaf[i]);
std::swap(m_path[i], bt.m_path[i]);
}
}
_byte_tree& operator=(const _byte_tree& bt) {
if (this != &bt) {
copy(bt);
}
return *this;
}
//! Serializes the data structure into the given ostream
uint64_t serialize(std::ostream& out, structure_tree_node* v=nullptr,
std::string name="") const {
structure_tree_node* child = structure_tree::add_child(
v, name, util::class_name(*this));
uint64_t written_bytes = 0;
uint64_t m_nodes_size = m_nodes.size();
write_member(m_nodes_size, out, child, "m_nodes.size()");
serialize_vector(m_nodes, out, child, "m_nodes");
out.write((char*) m_c_to_leaf, fixed_sigma*sizeof(m_c_to_leaf[0]));
written_bytes += fixed_sigma*sizeof(m_c_to_leaf[0]);// bytes from previous loop
out.write((char*) m_path, fixed_sigma*sizeof(m_path[0]));
written_bytes += fixed_sigma*sizeof(m_path[0]);// bytes from previous loop
structure_tree::add_size(child, written_bytes);
return written_bytes;
}
//! Loads the data structure from the given istream.
void load(std::istream& in) {
uint64_t m_nodes_size = 0;
read_member(m_nodes_size, in);
m_nodes = std::vector<data_node>(m_nodes_size);
load_vector(m_nodes, in);
in.read((char*) m_c_to_leaf, fixed_sigma*sizeof(m_c_to_leaf[0]));
in.read((char*) m_path, fixed_sigma*sizeof(m_path[0]));
}
//! Get corresponding leaf for symbol c.
inline node_type c_to_leaf(value_type c)const {
return m_c_to_leaf[c];
}
//! Return the root node of the tree.
inline static node_type root() {
return 0;
}
//! Return the number of nodes in the tree.
uint64_t size() const {
return m_nodes.size();
}
//! Return the parent node of v.
inline node_type parent(node_type v)const {
return m_nodes[v].parent;
}
//! Return left (i=0) or right (i=1) child node of v.
inline node_type child(node_type v, uint8_t i)const {
return m_nodes[v].child[i];
}
//! Return if v is a leaf node.
inline bool is_leaf(node_type v)const {
return m_nodes[v].child[0] == undef;
}
//! Return the path as left/right bit sequence in a uint64_t
inline uint64_t bit_path(value_type c)const {
return m_path[c];
}
//! Return the start of the node in the WT's bit vector
inline uint64_t bv_pos(node_type v)const {
return m_nodes[v].bv_pos;
}
//! Returns for node v the rank of 1's up to bv_pos(v)
inline uint64_t bv_pos_rank(node_type v)const {
return m_nodes[v].bv_pos_rank;
}
//! Return if the node is a valid node
inline bool is_valid(node_type v)const {
return v != undef;
}
//! Return symbol c or the next larger symbol in the wt
inline std::pair<bool,value_type> symbol_gte(value_type c) const
{
for(uint32_t i=c;i<fixed_sigma;i++) {
if(m_c_to_leaf[i]!=undef) {
return {true,i};
}
}
return {false,0};
}
//! Return symbol c or the next smaller symbol in the wt
inline std::pair<bool,value_type> symbol_lte(value_type c) const
{
for(uint32_t i=c;i>0;i--) {
if(m_c_to_leaf[i]!=undef) {
return {true,i};
}
}
if(m_c_to_leaf[0]!=undef)
return {true,0};
return {false,0};
}
};
// Strategy class for tree representation of a WT
template<bool t_dfs_shape=false>
struct byte_tree {
template<class t_wt>
using type = _byte_tree<t_dfs_shape, t_wt>;
};
// Strategy class for tree representation of a WT
template<bool t_dfs_shape, class t_wt>
struct _int_tree {
using alphabet_category = int_alphabet_tag;
using value_type = uint64_t;
using node_type = uint64_t; // node is represented by index in m_nodes
using data_node = _node<_int_tree>;
enum :uint64_t {undef = 0xFFFFFFFFFFFFFFFFULL}; // max uint64_t value
enum :uint8_t {int_width = 0}; // width of the input integers is variable
std::vector<data_node> m_nodes; // nodes for the prefix code tree structure
std::vector<node_type> m_c_to_leaf; // map symbol c to a leaf in the tree structure
// if m_c_to_leaf[c] == undef the char does
// not exists in the text
std::vector<uint64_t> m_path; // path information for each char; the bits at position
// 0..55 hold path information; bits 56..63 the length
// of the path in binary representation
void copy(const _int_tree& bt) {
m_nodes = bt.m_nodes;
m_c_to_leaf = bt.m_c_to_leaf;
m_path = bt.m_path;
}
_int_tree() {}
_int_tree(const std::vector<pc_node>& temp_nodes, uint64_t& bv_size, const t_wt*) {
m_nodes.resize(temp_nodes.size());
m_nodes[0] = temp_nodes.back(); // insert root at index 0
bv_size = 0;
size_t node_cnt = 1;
node_type last_parent = undef;
std::deque<node_type> q;
q.push_back(0);
uint64_t max_c = 0;
while (!q.empty()) {
node_type idx;
if (!t_dfs_shape) {
idx = q.front(); q.pop_front();
} else {
idx = q.back(); q.pop_back();
}
// frq_sum is store in bv_pos value
uint64_t frq = m_nodes[idx].bv_pos;
m_nodes[idx].bv_pos = bv_size;
if (m_nodes[idx].child[0] != undef) { // if node is not a leaf
bv_size += frq; // add frequency
} else if (max_c < m_nodes[idx].bv_pos_rank) { // node is leaf and contains large symbol
max_c = m_nodes[idx].bv_pos_rank;
}
if (idx > 0) { // node is not the root
if (last_parent != m_nodes[idx].parent)
m_nodes[m_nodes[idx].parent].child[0] = idx;
else
m_nodes[m_nodes[idx].parent].child[1] = idx;
last_parent = m_nodes[idx].parent;
}
if (m_nodes[idx].child[0] != undef) { // if node is not a leaf
for (uint32_t k=0; k<2; ++k) { // add children to tree
m_nodes[node_cnt] = temp_nodes[ m_nodes[idx].child[k] ];
m_nodes[node_cnt].parent = idx;
q.push_back(node_cnt);
m_nodes[idx].child[k] = node_cnt++;
}
}
}
// initialize m_c_to_leaf
// if c is not in the alphabet m_c_to_leaf[c] = undef
m_c_to_leaf.resize(max_c+1, undef);
for (node_type v=0; v < m_nodes.size(); ++v) {
if (m_nodes[v].child[0] == undef) { // if node is a leaf
uint64_t c = m_nodes[v].bv_pos_rank;
m_c_to_leaf[c] = v; // calculate value
if (c > max_c) max_c = c;
}
}
m_path = std::vector<uint64_t>(m_c_to_leaf.size(), 0);
// initialize path information
// Note: In the case of a bfs search order,
// we can classify nodes as right child and left child with an easy criterion:
// node is a left child, if node%2==1
// node is a right child, if node%2==0
for (value_type c=0, prev_c=0; c < m_c_to_leaf.size(); ++c) {
if (m_c_to_leaf[c] != undef) { // if char exists in the alphabet
node_type v = m_c_to_leaf[c];
uint64_t w = 0; // path
uint64_t l = 0; // path len
while (v != root()) { // while node is not the root
w <<= 1;
if (m_nodes[m_nodes[v].parent].child[1] == v) // if the node is a right child
w |= 1ULL;
++l;
v = m_nodes[v].parent; // go up the tree
}
if (l > 56) {
throw std::logic_error("Code depth greater than 56!!!");
}
m_path[c] = w | (l << 56);
prev_c = c;
} else {
uint64_t pl = 0; // len is 0, good for special case in rank
m_path[c] = prev_c | (pl << 56);
}
}
}
template<class t_rank_type>
void init_node_ranks(const t_rank_type& rank) {
for (uint64_t i=0; i<m_nodes.size(); ++i) {
if (m_nodes[i].child[0] != undef) // if node is not a leaf
m_nodes[i].bv_pos_rank = rank.rank(m_nodes[i].bv_pos);
}
}
_int_tree(const _int_tree& bt) {
copy(bt);
}
void swap(_int_tree& bt) {
std::swap(m_nodes, bt.m_nodes);
std::swap(m_c_to_leaf, bt.m_c_to_leaf);
std::swap(m_path, bt.m_path);
}
_int_tree& operator=(const _int_tree& bt) {
if (this != &bt) {
copy(bt);
}
return *this;
}
//! Serializes the data structure into the given ostream
uint64_t serialize(std::ostream& out, structure_tree_node* v=nullptr,
std::string name="") const {
structure_tree_node* child = structure_tree::add_child(
v, name, util::class_name(*this));
uint64_t written_bytes = 0;
uint64_t m_nodes_size = m_nodes.size();
written_bytes += write_member(m_nodes_size, out, child, "m_nodes.size()");
written_bytes += serialize_vector(m_nodes, out, child, "m_nodes");
uint64_t m_c_to_leaf_size = m_c_to_leaf.size();
written_bytes += write_member(m_c_to_leaf_size, out, child, "m_c_to_leaf.size()");
written_bytes += serialize_vector(m_c_to_leaf, out, child, "m_c_to_leaf");
uint64_t m_path_size = m_path.size();
written_bytes += write_member(m_path_size, out, child, "m_path.size()");
written_bytes += serialize_vector(m_path, out, child, "m_path");
structure_tree::add_size(child, written_bytes);
return written_bytes;
}
//! Loads the data structure from the given istream.
void load(std::istream& in) {
uint64_t m_nodes_size = 0;
read_member(m_nodes_size, in);
m_nodes = std::vector<data_node>(m_nodes_size);
load_vector(m_nodes, in);
uint64_t m_c_to_leaf_size = 0;
read_member(m_c_to_leaf_size, in);
m_c_to_leaf = std::vector<node_type>(m_c_to_leaf_size);
load_vector(m_c_to_leaf, in);
uint64_t m_path_size = 0;
read_member(m_path_size, in);
m_path = std::vector<uint64_t>(m_path_size);
load_vector(m_path, in);
}
//! Get corresponding leaf for symbol c.
inline node_type c_to_leaf(value_type c)const {
if (c >= m_c_to_leaf.size())
return undef;
else
return m_c_to_leaf[c];
}
//! Return the root node of the tree.
inline static node_type root() {
return 0;
}
//! Return the number of nodes in the tree.
uint64_t size() const {
return m_nodes.size();
}
//! Return the parent node of v.
inline node_type parent(node_type v)const {
return m_nodes[v].parent;
}
//! Return left (i=0) or right (i=1) child node of v.
inline node_type child(node_type v, uint8_t i)const {
return m_nodes[v].child[i];
}
//! Return if v is a leaf node.
inline bool is_leaf(node_type v)const {
return m_nodes[v].child[0] == undef;
}
//! Return the path as left/right bit sequence in a uint64_t
inline uint64_t bit_path(value_type c)const {
if (c >= m_path.size()) {
return m_path.size()-1;
}
return m_path[c];
}
//! Return the start of the node in the WT's bit vector
inline uint64_t bv_pos(node_type v)const {
return m_nodes[v].bv_pos;
}
//! Returns for node v the rank of 1's up to bv_pos(v)
inline uint64_t bv_pos_rank(node_type v)const {
return m_nodes[v].bv_pos_rank;
}
//! Return if the node is a valid node
inline bool is_valid(node_type v)const {
return v != undef;
}
//! Return symbol c or the next larger symbol in the wt
inline std::pair<bool,value_type> symbol_gte(value_type c) const
{
if(c >= m_c_to_leaf.size()) {
return {false,0};
}
for(value_type i=c;i<m_c_to_leaf.size();i++) {
if(m_c_to_leaf[i]!=undef) {
return {true,i};
}
}
return {false,0};
}
//! Return symbol c or the next smaller symbol in the wt
inline std::pair<bool,value_type> symbol_lte(value_type c) const
{
if(c >= m_c_to_leaf.size()) {
// return the largest symbol
c = m_c_to_leaf.size()-1;
}
for(value_type i=c;i>0;i--) {
if(m_c_to_leaf[i]!=undef) {
return {true,i};
}
}
if(m_c_to_leaf[0]!=undef)
return {true,0};
return {false,0};
}
};
// Strategy class for tree representation of a WT
template<bool t_dfs_shape=false>
struct int_tree {
template<class t_wt>
using type = _int_tree<t_dfs_shape, t_wt>;
};
} // end namespace sdsl
#endif
|