/usr/include/shogun/lib/DelimiterTokenizer.h is in libshogun-dev 3.1.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evangelos Anagnostopoulos
* Copyright (C) 2013 Evangelos Anagnostopoulos
*/
#ifndef _DELIMITERTOKENIZER__H__
#define _DELIMITERTOKENIZER__H__
#include <shogun/lib/Tokenizer.h>
namespace shogun
{
/* Forward declaration. CTokenizer is used as the base class below, which
* requires its complete definition; that definition is expected to come
* from the <shogun/lib/Tokenizer.h> include.
*/
class CTokenizer;
/** @brief Tokenizes an SGVector<char> into tokens, using a configurable
* set of single characters as delimiters.
*
* Delimiters are chosen through the public field #delimiters: setting the
* entry indexed by a character to 1 marks that character as a delimiter.
* E.g. to use ':' as a delimiter, one should do:
* tokenizer->delimiters[':'] = 1;
*/
class CDelimiterTokenizer: public CTokenizer
{
public:
/** default constructor
*
* Because the parameter has a default value, this constructor also
* serves as the no-argument constructor.
*
* @param skip_delimiters whether to skip consecutive delimiters or not
* (defaults to false)
*/
CDelimiterTokenizer(bool skip_delimiters = false);
/** copy constructor
*
* @param orig the original DelimiterTokenizer to copy from
*/
CDelimiterTokenizer(const CDelimiterTokenizer& orig);
/** destructor (empty body; defined inline in this header) */
virtual ~CDelimiterTokenizer() {}
/** Set the char array that requires tokenization
*
* @param txt the text to tokenize
*/
virtual void set_text(SGVector<char> txt);
/** Reports whether there is another token left in the text.
*
* @return true if another token exists, false otherwise
*/
virtual bool has_next();
/** Returns the indices, start and end, of the next token in line.
* The start index is passed back through the reference parameter and
* the end index is the return value.
* If the next token starts with a delimiter and
* skip_consecutive_delimiters is false, the same index is returned
* for start and end.
*
* @param start output: token's starting index
* @return token's ending index (exclusive)
*/
virtual index_t next_token_idx(index_t& start);
/** Returns the name of the SGSerializable instance. It MUST BE
* the CLASS NAME without the prefixed 'C'.
* NOTE(review): for this class that is presumably "DelimiterTokenizer";
* the returned string is not visible in this header.
*
* @return name of the SGSerializable
*/
virtual const char* get_name() const;
/** Configures the tokenizer to use ' ' or '\\t'
* as the delimiters for the tokenization process.
*/
void init_for_whitespace();
/** Returns a pointer to a CDelimiterTokenizer.
* NOTE(review): judging by the name this is presumably a newly created
* copy of this tokenizer; who owns / must release the returned object
* is not visible in this header - confirm against the implementation.
*
* @return pointer to a CDelimiterTokenizer
*/
CDelimiterTokenizer* get_copy();
/** Resets the delimiters */
void clear_delimiters();
/** Get skip_consecutive_delimiters
*
* @return true if skipping of consecutive delimiters is enabled
*/
bool get_skip_delimiters() const;
/** Set the value of skip_consecutive_delimiters
*
* @param skip_delimiters whether to skip or not consecutive delimiters
*/
void set_skip_delimiters(bool skip_delimiters);
private:
/** Internal initialization helper.
* NOTE(review): its body is not visible here; presumably it is shared by
* the constructors to set up the members - confirm in the implementation.
*/
void init();
public:
/** Delimiter flags, indexed by character (see the class description):
* a set entry marks the corresponding character as a delimiter.
* This field is public so that callers can set entries directly.
*/
SGVector<bool> delimiters;
protected:
/** index of last token
* NOTE(review): whether this is the start or the end position of the
* last returned token is not visible in this header.
*/
index_t last_idx;
/** whether to skip consecutive delimiters or not */
bool skip_consecutive_delimiters;
};
}
#endif /* _DELIMITERTOKENIZER__H__ */