/usr/include/falcon/tokenizer.h is in falconpl-dev 0.9.6.9-git20120606-2.1+b1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
FALCON - The Falcon Programming Language
FILE: tokenizer.h
Utility to parse complex and lengthy strings.
-------------------------------------------------------------------
Author: Giancarlo Niccolai
Begin: Wed, 04 Feb 2009 12:02:01 +0100
-------------------------------------------------------------------
(C) Copyright 2009: the FALCON developers (see list in AUTHORS file)
See LICENSE file for licensing details.
*/
#ifndef FLC_TOKENIZER_H
#define FLC_TOKENIZER_H
#include <falcon/setup.h>
#include <falcon/types.h>
#include <falcon/string.h>
#include <falcon/sequence.h>
#include <falcon/item.h>
namespace Falcon
{
/** Parameters for the tokenizer.
This is used for the named parameter idiom when initializing the Tokenizer class.
Pass a direct instance of this class to configure the target Tokenizer.
The setting methods in this class return a reference to the instance itself,
so that it is possible to set several behaviors and settings in cascade.
*/
class FALCON_DYN_CLASS TokenizerParams: public BaseAlloc
{
   // Option storage; every flag is off and the size limit is -1 after construction.
   bool m_bGroupSep;
   bool m_bBindSep;
   bool m_bTrim;
   int32 m_nMaxToken;
   bool m_bWsIsToken;
   bool m_bReturnSep;

public:
   /** Creates a parameter set with all the boolean options disabled.
      The maximum token size starts at -1 (presumably meaning "no limit";
      confirm against the Tokenizer implementation).
   */
   TokenizerParams():
      m_bGroupSep( false ),
      m_bBindSep( false ),
      m_bTrim( false ),
      m_nMaxToken( -1 ),
      m_bWsIsToken( false ),
      m_bReturnSep( false )
   {
   }

   /** Collapse a run of identical separators into a single separation.
      For example, if the separator list includes a space, a single split is
      performed no matter how many consecutive spaces are encountered.
      When this option is not set, an empty string is returned as a token
      whenever two separators are found one after another.
      \param enable true (default) to activate the option.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &groupSep( bool enable = true )
   {
      m_bGroupSep = enable;
      return *this;
   }

   /** Tells whether separator grouping has been requested. */
   bool isGroupSep() const
   {
      return m_bGroupSep;
   }

   /** Treat a sequence of whitespaces of any length as a single token.
      This separates words between spaces and other tokens. For example,
      a text analyzer may use this mode to get words and punctuation
      with a single "next" call.
      \param enable true (default) to activate the option.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &wsIsToken( bool enable = true )
   {
      m_bWsIsToken = enable;
      return *this;
   }

   /** Tells whether whitespace runs are to be treated as tokens. */
   bool isWsToken() const
   {
      return m_bWsIsToken;
   }

   /** Attach the separators to the token that precedes them.
      The separators are appended to the preceding token when it is returned.
      If grouping is activated, more than a single separator may be appended.
      \param enable true (default) to activate the option.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &bindSep( bool enable = true )
   {
      m_bBindSep = enable;
      return *this;
   }

   /** Tells whether separators are to be bound to the preceding token. */
   bool isBindSep() const
   {
      return m_bBindSep;
   }

   /** Return the separators found as tokens on their own.
      This forces the tokenizer to return each separator in a separate call.
      For example, if "," is a separator:
      \code
         "a, b, c"
      \endcode
      would be returned as "a" - "," - " b" - "," - " c".
      \param enable true (default) to activate the option.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &returnSep( bool enable = true )
   {
      m_bReturnSep = enable;
      return *this;
   }

   /** Tells whether separators are to be returned as separate tokens. */
   bool isReturnSep() const
   {
      return m_bReturnSep;
   }

   /** Trim whitespaces from the returned tokens.
      Whitespaces are tab, space, carriage return and line feed characters.
      If this option is activated, the returned tokens won't include spaces
      found at their beginning or at their end. For example, if the separator
      is ':' and trim is enabled, the following sequence:
      \code
         : a: b : :c
      \endcode
      will be parsed as a sequence of "a", "b", "", "c" tokens; otherwise, it
      would be parsed as " a", " b ", " ", "c".
      \param enable true (default) to activate the option.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &trim( bool enable = true )
   {
      m_bTrim = enable;
      return *this;
   }

   /** Tells whether token trimming has been requested. */
   bool isTrim() const
   {
      return m_bTrim;
   }

   /** Sets the maximum size of the returned tokens.
      If the size of the input data exceeds this size while searching for a
      token, an item is returned as if a separator was found.
      \param size the maximum token size.
      \return this same instance, to allow chaining.
   */
   TokenizerParams &maxToken( int32 size )
   {
      m_nMaxToken = size;
      return *this;
   }

   /** Returns the configured maximum token size (-1 unless changed). */
   int32 maxToken() const
   {
      return m_nMaxToken;
   }
};
/** Base class for tokenizers.
Although this class is declared as a sequence, it only supports the empty() and getIterator()
operations, needed for the TRAV loop in the vm.
In the future, some subclasses may support some specific operations when they are
locally buffered.
The tokenizer is designed to operate with subclasses tokenizing a string and/or
a stream. The StringStream class is not used for strings for performance reasons; as
a very limited subset of operations is needed (namely, get), and as the visibility of
the underlying buffer is useful (i.e. to avoid storing a local copy of the forming buffer),
the StringTokenizer has a different, optimized implementation.
The iterator generated by a Tokenizer is one-way only. Every
next() operation on an iterator invalidates the others; this means that only one iterator
at a time can be used on a tokenizer.
\note As this class is used only internally, there is no need to mark the owner
on GC as usual. All GC marking must be external (i.e. placing the item as a hidden property of the
tokenizer object).
*/
class FALCON_DYN_CLASS Tokenizer: public Sequence
{
   /** Separators used to split the input (presumably the \b seps constructor argument). */
   String m_separators;
   /** Tokenization options. */
   TokenizerParams m_params;
   /** Input being tokenized; zero when the tokenizer is not ready (see isReady()). */
   Stream *m_input;
   /** Presumably true when this tokenizer owns m_input -- confirm in the implementation. */
   bool m_bOwnStream;
   /** Text of the current token, as exposed by getToken(). */
   String m_temp;
   // Internal tokenization state; the exact semantics of the two fields
   // below are defined in the implementation file.
   uint32 m_version;
   uint32 m_nextToken;
   /** Value reported by hasCurrent(). */
   bool m_hasCurrent;

public:
   /** Creates a ROStringStream and uses that to read the source.
      WARNING: the source must be granted to stay alive for the whole duration of the tokenization,
      as nothing is going to create a local safe copy of the given string.
      \param params tokenization options.
      \param seps the separators.
      \param source the string to be tokenized.
   */
   Tokenizer( TokenizerParams &params, const String &seps, const String &source );

   /** Creates a tokenizer reading from a stream.
      \param params tokenization options.
      \param seps the separators.
      \param inp the input stream; defaults to 0 (presumably to be provided
         later through parse() -- confirm in the implementation).
      \param bOwn presumably true to let the tokenizer own \b inp -- confirm
         in the implementation.
   */
   Tokenizer( TokenizerParams &params, const String &seps, Stream *inp=0, bool bOwn = false );

   /** Copy constructor. */
   Tokenizer( const Tokenizer &other );

   virtual ~Tokenizer();

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual const Item &front() const;

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual const Item &back() const;

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual void clear();

   /** Returns true if the tokenizer knows it can't return any other element. */
   virtual bool empty() const;

   /** Advance to a further token.
      \return false if no more token can be found
   */
   virtual bool next();

   /** Returns the currently active range after a successful next(). */
   const String &getToken() const { return m_temp; }

   /** Restarts the tokenization.
      This may fail if the underlying stream doesn't support seek.
   */
   virtual void rewind();

   /** Sequence cloning interface. */
   virtual Tokenizer* clone() const;

   /** GC marking; simply forwards to the Sequence base class. */
   virtual void gcMark( uint32 mark ) { Sequence::gcMark( mark ); }

   /** Resets the tokenizer providing new data to be tokenized.
      The string \b data must stay alive as long as this parser
      is used, as it is not internally copied anywhere.
   */
   void parse( const String &data );

   /** Resets the tokenizer providing new data to be tokenized.
      NOTE(review): the original description of \b bOwn was truncated
      ("If the parameter bOwn is true, ..."). Presumably, when \b bOwn is
      true the tokenizer takes ownership of the stream -- confirm against
      the implementation.
   */
   void parse( Stream *in, bool bOwn = false );

   /** Returns true if the tokenizer has been readied with a stream. */
   bool isReady() const { return m_input != 0; }

   /** Returns the internal "has current token" flag. */
   bool hasCurrent() const { return m_hasCurrent; }

   // Sequence interface; behavior is defined in the implementation file
   // (presumably unsupported, like front() and back() -- confirm).
   virtual void append( const Item& itm );
   virtual void prepend( const Item& itm );

   //========================================================
   // Iterator implementation.
   //========================================================
protected:
   virtual void getIterator( Iterator& tgt, bool tail = false ) const;
   virtual void copyIterator( Iterator& tgt, const Iterator& source ) const;
   virtual void insert( Iterator &iter, const Item &data );
   virtual void erase( Iterator &iter );
   virtual bool hasNext( const Iterator &iter ) const;
   virtual bool hasPrev( const Iterator &iter ) const;
   virtual bool hasCurrent( const Iterator &iter ) const;
   virtual bool next( Iterator &iter ) const;
   virtual bool prev( Iterator &iter ) const;
   virtual Item& getCurrent( const Iterator &iter );
   virtual Item& getCurrentKey( const Iterator &iter );
   virtual bool equalIterator( const Iterator &first, const Iterator &second ) const;
};
}
#endif
/* end of tokenizer.h */