/usr/include/CLucene/index/TermVector.h is in libclucene-dev 0.9.21b-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_index_termvector_h
#define _lucene_index_termvector_h
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include "FieldInfos.h"
CL_NS_DEF(index)
// Forward declarations: both types are used by declarations below before
// their full definitions appear further down in this header.
struct TermVectorOffsetInfo;
class TermPositionVector;
/** Provides access to stored term vector of
* a document field.
*
* This is an abstract interface: apart from the destructor, every member
* declared here is pure virtual.
*/
class TermFreqVector:LUCENE_BASE {
public:
// Virtual so that an implementation can be destroyed through a TermFreqVector pointer.
virtual ~TermFreqVector(){
}
/**
*
* @return The field this vector is associated with.
*
*/
virtual const TCHAR* getField() = 0;
/**
* @return The number of terms in the term vector.
*/
virtual int32_t size() = 0;
/**
* @return An Array of term texts in ascending order.
*/
virtual const TCHAR** getTerms() = 0;
/** Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from <code>getTerms</code>
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field.
*
* The size of the returned array is size()
* @memory Returning a pointer to internal data. Do not delete.
*/
virtual const Array<int32_t>* getTermFrequencies() = 0;
/** Return an index in the term array returned from
* <code>getTerms</code> at which the term with the specified
* <code>term</code> text appears. If this term does not appear in the array,
* return -1.
*
* @param term text of the term to look for
*/
virtual int32_t indexOf(const TCHAR* term) = 0;
/** Just like <code>indexOf(const TCHAR*)</code> but searches for a number of terms
* at the same time. The function itself returns nothing; the results are
* delivered through <code>ret</code>, which is documented to have the same
* size as the number of terms searched for, each slot containing the result
* of searching for the corresponding term.
*
* @param terms array containing terms to look for
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
* @param ret output array receiving the per-term results
*/
virtual void indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, Array<int32_t>& ret) = 0;
/** Solve the diamond inheritance problem by providing a reinterpret function.
* No dynamic casting is required and no RTTI data is needed to do this
*
* @return this object viewed as a TermPositionVector.
* NOTE(review): presumably NULL for vectors that carry no position data --
* confirm against the implementations.
*/
virtual TermPositionVector* __asTermPositionVector()=0;
};
/**
 * Writes term vectors one document at a time: a document is opened, each of
 * its fields is opened in turn, the terms of that field are added, and the
 * field and finally the document are closed again.
 *
 * Sketch of the expected call sequence:
 *
 <CODE>
 for each document
 {
     writer.openDocument();
     for each field on the document
     {
         writer.openField(field);
         for all of the terms
         {
             writer.addTerm(...)
         }
         writer.closeField
     }
     writer.closeDocument()
 }
 </CODE>
 */
class TermVectorsWriter:LUCENE_BASE {
private:
    /** Record kept for one field: its number, a pointer value associated
     *  with the tvf stream, a length counter and the two storage flags.
     */
    class TVField:LUCENE_BASE {
    public:
        int32_t number;
        int64_t tvfPointer;
        int32_t length; // number of distinct term positions
        bool storePositions;
        bool storeOffsets;

        /** Stores the field number and both flags; tvfPointer and length
         *  start at zero. Initializers are listed in member declaration order.
         */
        TVField(int32_t number, bool storePos, bool storeOff):
            number(number),
            tvfPointer(0),
            length(0),
            storePositions(storePos),
            storeOffsets(storeOff)
        {
        }
        ~TVField(){}
    };

    /** Record kept for one term: its text (with a cached length), its
     *  frequency and optional position / offset arrays.
     */
    class TVTerm:LUCENE_BASE {
    private:
        const TCHAR* termText;
        int32_t termTextLen; // cached length of termText
    public:
        TVTerm();
        ~TVTerm();

        int32_t freq;
        Array<int32_t>* positions;
        Array<TermVectorOffsetInfo>* offsets;

        const TCHAR* getTermText() const;
        size_t getTermTextLen();
        void setTermText(const TCHAR* val);
    };

    // The three output streams, named after the tvx / tvd / tvf file extensions below.
    CL_NS(store)::IndexOutput* tvx;
    CL_NS(store)::IndexOutput* tvd;
    CL_NS(store)::IndexOutput* tvf;

    // Collected field and term records; the Deletor::Object policy is passed to CLVector for both.
    CL_NS(util)::CLVector<TVField*,CL_NS(util)::Deletor::Object<TVField> > fields;
    CL_NS(util)::CLVector<TVTerm*,CL_NS(util)::Deletor::Object<TVTerm> > terms;

    FieldInfos* fieldInfos;
    TVField* currentField;
    int64_t currentDocPointer;

    // Internal helpers; their definitions are not part of this header.
    void addTermInternal(const TCHAR* termText, const int32_t freq,
        Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets);
    void writeField();
    void writeDoc();
    // Private overload taking the field number and both storage flags directly.
    void openField(int32_t fieldNumber, bool storePositionWithTermVector,
        bool storeOffsetWithTermVector);

public:
    LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 2);

    // Number of bytes FORMAT_VERSION occupies at the beginning of each file.
    LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4);

    // Flag bits for storing positions / offsets together with the term vector.
    LUCENE_STATIC_CONSTANT(uint8_t, STORE_POSITIONS_WITH_TERMVECTOR = 0x1);
    LUCENE_STATIC_CONSTANT(uint8_t, STORE_OFFSET_WITH_TERMVECTOR = 0x2);

    // Extensions of the three term vector files (values are defined outside this header).
    static const char* LUCENE_TVX_EXTENSION;
    static const char* LUCENE_TVD_EXTENSION;
    static const char* LUCENE_TVF_EXTENSION;

    /**
     * @param directory   directory object handed to the writer
     * @param segment     segment name
     * @param fieldInfos  field information used by the writer
     */
    TermVectorsWriter(CL_NS(store)::Directory* directory, const char* segment,
        FieldInfos* fieldInfos);
    ~TermVectorsWriter();

    void openDocument();
    void closeDocument();

    /** Closes every stream. */
    void close();

    /** Document-level counterpart of isFieldOpen(). */
    bool isDocumentOpen() const;

    /** Begins processing of a field. Any number of addTerm calls may follow,
     *  and closeField ends the processing of the field. A field that is still
     *  open from an earlier call is closed automatically.
     */
    void openField(const TCHAR* field);

    /** Ends processing of the current field. openField has to be called
     *  again before any further addTerm call.
     */
    void closeField();

    /** @return true while a field is open. */
    bool isFieldOpen() const;

    /**
     * Adds a whole document described by all of its term vectors. For a
     * document without term vectors a value is still added for tvx.
     *
     * @param vectors the term vectors of the document
     * @throws IOException
     */
    void addAllDocVectors(Array<TermFreqVector*>& vectors);

    /** Adds one term to the term vector of the open field.
     *  Terms are expected in increasing term order, with a single call per
     *  unique term. freq is the number of times the term appears in this
     *  field of this document. positions and offsets are optional and
     *  default to NULL.
     *  @throws IllegalStateException if document or field is not open
     */
    void addTerm(const TCHAR* termText, int32_t freq,
        Array<int32_t>* positions = NULL, Array<TermVectorOffsetInfo>* offsets = NULL);
};
/**
* TermFreqVector implementation that keeps a field name, an array of term
* texts and an array of term frequencies as members.
*
* Inherits TermFreqVector virtually, so a class that also derives from
* TermPositionVector (which inherits TermFreqVector virtually as well)
* shares a single TermFreqVector base.
*/
class SegmentTermVector: public virtual TermFreqVector {
private:
const TCHAR* field;
TCHAR** terms;
int32_t termsLen; //cache -- presumably the number of entries in terms; confirm in the implementation
Array<int32_t>* termFreqs;
// Private lookup helper over an array of term texts of length arraylen.
// NOTE(review): judging by the name this is a binary search for key; the
// meaning of the return value is not visible from this header.
int32_t binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const;
public:
//note: termFreqs must be the same length as terms
// NOTE(review): ownership of terms / termFreqs is not visible from this header.
SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs);
virtual ~SegmentTermVector();
/**
*
* @return The field this vector is associated with (a string, not a field number)
*/
const TCHAR* getField();
// Not part of the TermFreqVector interface declared in this header.
TCHAR* toString() const;
// The members below have the same signatures as the TermFreqVector pure virtuals.
int32_t size();
const TCHAR** getTerms();
const Array<int32_t>* getTermFrequencies();
int32_t indexOf(const TCHAR* termText);
void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret);
virtual TermPositionVector* __asTermPositionVector();
};
/**
* Reading counterpart of TermVectorsWriter: holds three IndexInput streams
* (tvx, tvd, tvf) and hands out term vectors per document through get().
*/
class TermVectorsReader:LUCENE_BASE {
private:
FieldInfos* fieldInfos;
CL_NS(store)::IndexInput* tvx;
CL_NS(store)::IndexInput* tvd;
CL_NS(store)::IndexInput* tvf;
int64_t _size;
int32_t tvdFormat;
int32_t tvfFormat;
// NOTE(review): presumably reads and validates the format version at the start of
// the given stream and returns it -- confirm in the implementation.
int32_t checkValidFormat(CL_NS(store)::IndexInput* in);
// Reads len term vectors described by the parallel arrays fields / tvfPointers;
// the results are delivered through _return.
void readTermVectors(const TCHAR** fields, const int64_t* tvfPointers, const int32_t len, Array<TermFreqVector*>& _return);
/**
*
* @param field The field to read in
* @param tvfPointer The pointer within the tvf file where we should start reading
* @return The TermVector located at that position
* @throws IOException
*/
SegmentTermVector* readTermVector(const TCHAR* field, const int64_t tvfPointer);
int64_t size();
// Project macro; declares a mutex member named THIS_LOCK.
DEFINE_MUTEX(THIS_LOCK)
// Copy constructor is private, so outside code cannot copy a reader directly;
// a public clone() is declared below.
TermVectorsReader(const TermVectorsReader& copy);
public:
TermVectorsReader(CL_NS(store)::Directory* d, const char* segment, FieldInfos* fieldInfos);
~TermVectorsReader();
void close();
TermVectorsReader* clone() const;
/**
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
* @throws IOException if there is an error reading the term vector files
*/
TermFreqVector* get(const int32_t docNum, const TCHAR* field);
/**
* Retrieve all term vectors stored for this document.
*
* @param docNum The document number to retrieve the vectors for
* @param result Output array receiving the term frequency vectors
* @return a bool status; the vectors themselves are delivered through result.
* NOTE(review): presumably false when the vectors could not be read in --
* confirm against the implementation.
* @throws IOException if there is an error reading the term vector files
*/
bool get(int32_t docNum, Array<TermFreqVector*>& result);
};
/**
* Holds a start offset and an end offset, with accessors, an equality test
* and a hash function.
*/
struct TermVectorOffsetInfo {
// NOTE(review): the fields are declared int while the constructor and the
// accessors below use int32_t.
int startOffset;
int endOffset;
// Redundant in a struct (members are public by default); kept as in the original.
public:
// Static array of offset infos shared by the whole class.
// NOTE(review): presumably an empty array used as a "no offsets" value -- confirm.
static Array<TermVectorOffsetInfo> EMPTY_OFFSET_INFO;
TermVectorOffsetInfo();
~TermVectorOffsetInfo();
TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset);
int32_t getEndOffset() const;
void setEndOffset(int32_t endOffset);
int32_t getStartOffset() const;
void setStartOffset(int32_t startOffset);
// Takes a non-const pointer and is itself non-const.
bool equals(TermVectorOffsetInfo* o);
size_t hashCode() const;
};
/** Extends <code>TermFreqVector</code> to provide additional information about
* the positions in which each of the terms is found. A TermPositionVector does
* not necessarily contain both positions and offsets, but at least one of
* these arrays exists.
*
* Inherits TermFreqVector virtually; both members added here are pure virtual.
*/
class TermPositionVector: public virtual TermFreqVector {
public:
/** Returns an array of positions in which the term is found.
* Terms are identified by the index at which the term appears in the
* term array, as obtained from the <code>indexOf</code> method.
* May return null if positions have not been stored.
*
* @param index The index of the term whose positions are requested
*/
virtual Array<int32_t>* getTermPositions(int32_t index) = 0;
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
* May return null if offsets have not been stored.
*
* @see org.apache.lucene.analysis.Token
*
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
* NOTE(review): "may return null" and "or the empty list" disagree; confirm
* which one the implementations actually do.
*/
virtual Array<TermVectorOffsetInfo>* getOffsets(int32_t index) = 0;
// Virtual, with an empty body.
virtual ~TermPositionVector(){
}
};
class SegmentTermPositionVector: public SegmentTermVector, public TermPositionVector {
protected:
Array< Array<int32_t> >* positions;
Array< Array<TermVectorOffsetInfo> >* offsets;
static Array<int32_t> EMPTY_TERM_POS;
public:
SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets);
~SegmentTermPositionVector();
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
*
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
* @see org.apache.lucene.analysis.Token
*/
Array<TermVectorOffsetInfo>* getOffsets(int32_t index);
/**
* Returns an array of positions in which the term is found.
* Terms are identified by the index at which its number appears in the
* term String array obtained from the <code>indexOf</code> method.
*/
Array<int32_t>* getTermPositions(int32_t index);
const TCHAR* getField(){ return SegmentTermVector::getField(); }
TCHAR* toString() const{ return SegmentTermVector::toString(); }
int32_t size(){ return SegmentTermVector::size(); }
const TCHAR** getTerms(){ return SegmentTermVector::getTerms(); }
const Array<int32_t>* getTermFrequencies(){ return SegmentTermVector::getTermFrequencies(); }
int32_t indexOf(const TCHAR* termText){ return SegmentTermVector::indexOf(termText); }
void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret)
{ SegmentTermVector::indexesOf(termNumbers, start, len, ret); }
virtual TermPositionVector* __asTermPositionVector();
};
CL_NS_END
#endif
|