// /usr/include/falcon/tokenizer.h

/*
   FALCON - The Falcon Programming Language
   FILE: tokenizer.h

   Utility to parse complex and lengthy strings.
   -------------------------------------------------------------------
   Author: Giancarlo Niccolai
   Begin: Wed, 04 Feb 2009 12:02:01 +0100

   -------------------------------------------------------------------
   (C) Copyright 2009: the FALCON developers (see list in AUTHORS file)

   See LICENSE file for licensing details.
*/

#ifndef FLC_TOKENIZER_H
#define FLC_TOKENIZER_H

#include <falcon/setup.h>
#include <falcon/types.h>
#include <falcon/string.h>
#include <falcon/sequence.h>
#include <falcon/item.h>

namespace Falcon
{

/** Option set used to configure a Tokenizer.

   This class implements the "named parameter" idiom: build a TokenizerParams
   instance, chain the setters you need, and hand it to the Tokenizer
   constructor.

   Every setter returns a reference to the object it was called on, so several
   options can be configured in a single expression, e.g.
   \code
      TokenizerParams().groupSep().trim().maxToken( 128 )
   \endcode

   All the boolean options are off by default, and the maximum token size
   defaults to -1.
*/
class FALCON_DYN_CLASS TokenizerParams: public BaseAlloc
{
private:
   bool m_bGroupSep;
   bool m_bBindSep;
   bool m_bTrim;
   int32 m_nMaxToken;
   bool m_bWsIsToken;
   bool m_bReturnSep;

public:
   /** Creates a parameter set with every option disabled and no token size set (-1). */
   TokenizerParams():
      m_bGroupSep( false ),
      m_bBindSep( false ),
      m_bTrim( false ),
      m_nMaxToken( -1 ),
      m_bWsIsToken( false ),
      m_bReturnSep( false )
   {
   }

   //========================================================
   // Option readers.
   //========================================================

   /** True if runs of identical separators are collapsed (see groupSep()). */
   bool isGroupSep() const { return m_bGroupSep; }

   /** True if separators are attached to the preceding token (see bindSep()). */
   bool isBindSep() const { return m_bBindSep; }

   /** True if whitespace is stripped from the returned tokens (see trim()). */
   bool isTrim() const { return m_bTrim; }

   /** True if a whitespace sequence counts as a single token (see wsIsToken()). */
   bool isWsToken() const { return m_bWsIsToken; }

   /** Configured maximum token size; -1 unless changed through maxToken( int32 ). */
   int32 maxToken() const { return m_nMaxToken; }

   /** True if separators are returned as tokens of their own (see returnSep()). */
   bool isReturnSep() const { return m_bReturnSep; }

   //========================================================
   // Chainable option writers.
   //========================================================

   /** Collapse a run of identical separators into a single separation point.

      For example, if the separator list includes a space, a sequence of
      several spaces is reported only once. When this option is not set,
      two adjacent separators yield an empty string token between them.

      \param mode true to enable the option, false to disable it.
      \return this object, to allow call chaining.
   */
   TokenizerParams &groupSep( bool mode = true )
   {
      m_bGroupSep = mode;
      return *this;
   }

   /** Treat a sequence of whitespace of any length as a single token.

      This separates words from both whitespace and the other separators.
      For example, a text analyzer may use this mode to obtain words and
      punctuation with a single "next" call.

      \param mode true to enable the option, false to disable it.
      \return this object, to allow call chaining.
   */
   TokenizerParams &wsIsToken( bool mode = true )
   {
      m_bWsIsToken = mode;
      return *this;
   }

   /** Append the separators to the token that precedes them.

      The separator found after a token is returned together with that token.
      If grouping is active, more than one separator may be appended.

      \param mode true to enable the option, false to disable it.
      \return this object, to allow call chaining.
   */
   TokenizerParams &bindSep( bool mode = true )
   {
      m_bBindSep = mode;
      return *this;
   }

   /** Return each separator as a token of its own.

      This forces the tokenizer to hand out every separator found in a
      separate call. For example, if "," is a separator, the input

      \code
         "a, b, c"
      \endcode
      is returned as "a" - "," - " b" - "," - " c".

      \param mode true to enable the option, false to disable it.
      \return this object, to allow call chaining.
   */
   TokenizerParams &returnSep( bool mode = true )
   {
      m_bReturnSep = mode;
      return *this;
   }

   /** Strip whitespace from the returned tokens.

      Whitespace means tab, space, carriage return and line feed characters.
      When this option is active, the returned tokens do not include the
      whitespace found at their beginning or at their end. For example, if the
      separator is ':' and trimming is enabled, the sequence
      \code
         : a: b : :c
      \endcode
      is parsed as the tokens "a", "b", "", "c"; without trimming it would be
      parsed as " a", " b ", " ", "c".

      \param mode true to enable the option, false to disable it.
      \return this object, to allow call chaining.
   */
   TokenizerParams &trim( bool mode = true )
   {
      m_bTrim = mode;
      return *this;
   }

   /** Sets the maximum size of the returned tokens.

      If the input scanned while searching for a separator grows past this
      size, a token is returned as if a separator had been found.

      \param size the maximum token size.
      \return this object, to allow call chaining.
   */
   TokenizerParams &maxToken( int32 size )
   {
      m_nMaxToken = size;
      return *this;
   }
};


/** Base class for tokenizers.
   Although this class is declared as a sequence, it only supports the empty() and
   getIterator() operations, which are needed for the TRAV loop in the VM.

   In the future, some subclasses may support some specific operations when they are
   locally buffered.

   The tokenizer is designed to operate with subclasses tokenizing a string and/or
   a stream. The StringStream class is not used for strings for performance reasons: as
   only a very limited subset of operations is needed (namely, get), and as the visibility of
   the underlying buffer is useful (i.e. to avoid storing a local copy of the forming buffer),
   the StringTokenizer has a different, optimized implementation.

   The iterator generated by a Tokenizer is one-way only. Every
   next() operation on an iterator invalidates the others; this means that only one iterator
   at a time can be used on a tokenizer.

   \note As this class is used only internally, there is no need to mark the owner
         on GC as usual. All GC marking must be external (i.e. placing the item as a hidden
         property of the tokenizer object).
*/
class FALCON_DYN_CLASS Tokenizer: public Sequence
{
   // Separators and options; presumably copied from the constructor
   // arguments `seps` and `params` (implementation not visible here).
   String m_separators;
   TokenizerParams m_params;

   // Input source; a null pointer means the tokenizer is not ready (see isReady()).
   Stream *m_input;
   // NOTE(review): presumably true when this object must dispose of m_input
   // (mirrors the `bOwn` parameters) -- confirm against the implementation.
   bool m_bOwnStream;

   // Buffer holding the current token; exposed read-only through getToken().
   String m_temp;
   // NOTE(review): purpose not visible in this header; presumably used to
   // invalidate outstanding iterators after next() -- confirm.
   uint32 m_version;

   // NOTE(review): exact meaning not visible in this header -- confirm.
   uint32 m_nextToken;
   // Flag returned by hasCurrent().
   bool m_hasCurrent;

public:

   /** Creates a ROStringStream and uses that to read the source.
      WARNING: the source must be guaranteed to stay alive for the whole duration of the
      tokenization, as nothing is going to create a local safe copy of the given string.

      \param params options for this tokenizer.
      \param seps the separators.
      \param source the string to be tokenized.
   */
   Tokenizer( TokenizerParams &params, const String &seps, const String &source );

   /** Creates a tokenizer reading from a stream.
      The stream is optional (it defaults to a null pointer); isReady() returns
      false while no input stream is set.

      \param params options for this tokenizer.
      \param seps the separators.
      \param inp the input stream, or 0 for none.
      \param bOwn NOTE(review): presumably true to transfer ownership of \b inp
             to the tokenizer -- confirm against the implementation.
   */
   Tokenizer( TokenizerParams &params, const String &seps, Stream *inp=0, bool bOwn = false );

   /** Copy constructor. */
   Tokenizer( const Tokenizer &other );


   virtual ~Tokenizer();

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual const Item &front() const;

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual const Item &back() const;

   /** Calling this causes a CodeError (unimplemented) to be raised.
      Subclasses may provide a consistent behavior if they wish.
   */
   virtual void clear();

   /** Returns true if the tokenizer knows it can't return any other element. */
   virtual bool empty() const;

   /** Advances to the next token.
      \return false if no more tokens can be found.
   */
   virtual bool next();

   /** Returns the current token, which is meaningful after a successful next(). */
   const String &getToken() const { return m_temp; }

   /** This may fail if the underlying stream doesn't support seek.
   */
   virtual void rewind();

   /** Returns a copy of this tokenizer. */
   virtual Tokenizer* clone() const;

   /** Forwards the GC mark to the Sequence base class; nothing else is marked here. */
   virtual void gcMark( uint32 mark ) { Sequence::gcMark( mark ); }

   /** Resets the tokenizer providing new data to be tokenized.
      The string \b data must stay alive as long as this parser
      is used, as it is not internally copied anywhere.
   */
   void parse( const String &data );

   /** Resets the tokenizer providing new data to be tokenized.
      NOTE(review): the original description of \b bOwn was truncated. Presumably,
      when \b bOwn is true the tokenizer takes ownership of the stream \b in
      (see m_bOwnStream) -- confirm against the implementation.
   */
   void parse( Stream *in, bool bOwn = false );

   /** Returns true if the tokenizer has been readied with a stream
      (i.e. the input stream pointer is not null).
   */
   bool isReady() const { return m_input != 0; }

   /** Returns the internal "has current" flag.
      NOTE(review): presumably true when getToken() refers to a valid token -- confirm.
   */
   bool hasCurrent() const { return m_hasCurrent; }

   // Sequence interface.
   // NOTE(review): the class documentation states that only empty() and
   // getIterator() are supported, so these presumably raise an error -- confirm.
   virtual void append( const Item& itm );
   virtual void prepend( const Item& itm );

   //========================================================
   // Iterator implementation.
   // Per the class documentation, iterators on a Tokenizer are
   // one-way only and only one can be used at a time.
   //========================================================
protected:

   virtual void getIterator( Iterator& tgt, bool tail = false ) const;
   virtual void copyIterator( Iterator& tgt, const Iterator& source ) const;
   virtual void insert( Iterator &iter, const Item &data );
   virtual void erase( Iterator &iter );
   virtual bool hasNext( const Iterator &iter ) const;
   virtual bool hasPrev( const Iterator &iter ) const;
   virtual bool hasCurrent( const Iterator &iter ) const;
   virtual bool next( Iterator &iter ) const;
   virtual bool prev( Iterator &iter ) const;
   virtual Item& getCurrent( const Iterator &iter );
   virtual Item& getCurrentKey( const Iterator &iter );
   virtual bool equalIterator( const Iterator &first, const Iterator &second ) const;
};



}

#endif

/* end of tokenizer.h */
// falconpl-dev 0.9.6.9-git20120606-2.1+b1 / usr / include / falcon / tokenizer.h