/usr/include/shogun/lib/DelimiterTokenizer.h is in libshogun-dev 3.2.0-7.3build4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Evangelos Anagnostopoulos
 * Copyright (C) 2013 Evangelos Anagnostopoulos
 */
#ifndef _DELIMITERTOKENIZER__H__
#define	_DELIMITERTOKENIZER__H__
#include <shogun/lib/Tokenizer.h>
namespace shogun
{
class CTokenizer;
/** @brief The class CDelimiterTokenizer is used to tokenize
 *  a SGVector<char> into tokens using custom chars as delimiters.
 *
 *  The set of delimiters is held in the public field #delimiters, a
 *  SGVector<bool> that is indexed by character: a character is treated
 *  as a delimiter when its entry is set to 1 (true). E.g. to make the
 *  character ':' a delimiter, one should do:
 *  tokenizer->delimiters[':'] = 1;
 *
 *  Typical use, as far as this header shows: hand the text over with
 *  set_text(), then call next_token_idx() while has_next() returns true.
 *
 *  NOTE(review): the length of #delimiters and its initial contents are
 *  set up in the implementation file, not here -- confirm before relying
 *  on a particular index range.
 */
class CDelimiterTokenizer: public CTokenizer
{
public:
	/** default constructor
	 *
	 * NOTE(review): the constructor is not declared explicit, so a bool
	 * converts implicitly to a CDelimiterTokenizer.
	 *
	 * @param skip_delimiters whether to skip consecutive delimiters or not
	 *        (defaults to false)
	 */
	CDelimiterTokenizer(bool skip_delimiters = false);
	/** copy constructor
	 *
	 * @param orig the original DelimiterTokenizer
	 */
	CDelimiterTokenizer(const CDelimiterTokenizer& orig);
	/** destructor; virtual, with an empty body */
	virtual ~CDelimiterTokenizer() {}
	/** Set the char array that requires tokenization
	 *
	 * @param txt the text to tokenize (passed by value as a SGVector<char>)
	 */
	virtual void set_text(SGVector<char> txt);
	/** Returns true or false based on whether
	 * there exists another token in the text
	 *
	 * @return if another token exists
	 */
	virtual bool has_next();
	/** Method that returns the indices, start and end, of
	 *  the next token in line.
	 *  If next_token starts with a delimiter and skip_consecutive_delimiters is false,
	 *  it returns the same indices for start and end.
	 *
	 * @param start output parameter that receives the token's starting index
	 * @return token's ending index (exclusive)
	 */
	virtual index_t next_token_idx(index_t& start);
	/** Returns the name of the SGSerializable instance.  It MUST BE
	 * the CLASS NAME without the prefixed 'C'.
	 *
	 * @return name of the SGSerializable (presumably "DelimiterTokenizer",
	 *         following the rule above; the string itself is defined in the
	 *         implementation file)
	 */
	virtual const char* get_name() const;
	/** Makes the tokenizer to use ' ' or '\\t'
	 *  as the delimiters for the tokenization process;
	 */
	void init_for_whitespace();
	/** Get a copy of this tokenizer.
	 *
	 * NOTE(review): ownership / reference counting of the returned object
	 * is not visible in this header -- confirm in the implementation
	 * before deciding who releases it.
	 *
	 * @return pointer to a CDelimiterTokenizer copy
	 */
	CDelimiterTokenizer* get_copy();
	/** Resets the delimiters */
	void clear_delimiters();
	/** Get skip_consecutive_delimiters
	 *
	 * @return if skip consecutive delimiters is set
	 */
	bool get_skip_delimiters() const;
	/** set value for skip_consecutive_delimiters
	 *
	 * @param skip_delimiters whether to skip or not consecutive delimiters
	 */
	void set_skip_delimiters(bool skip_delimiters);
private:
	/** Private initialisation helper; presumably shared by the
	 *  constructors -- TODO confirm against the implementation file.
	 */
	void init();
public:
	/** delimiters; public so that callers can flag individual characters,
	 *  e.g. delimiters[':'] = 1
	 */
	SGVector<bool> delimiters;
protected:
	/** index of last token */
	index_t last_idx;
	/** whether to skip consecutive delimiters or not */
	bool skip_consecutive_delimiters;
};
}
#endif	/* _DELIMITERTOKENIZER__H__ */