/usr/lib/falcon/parser/genparser.fal

/*
   FALCON - Generic Parser

   FILE: genparser.fal

   Generic customizable grammar parser.
   -------------------------------------------------------------------
   Author: Giancarlo Niccolai
   Begin: Sat, 30 Aug 2010 09:42:22 +0200

   -------------------------------------------------------------------
   (C) Copyright 2010: the FALCON developers (see list in AUTHORS file)

   See LICENSE file for licensing details.
*/

directive version = 0x010001

import Context from self.context as Context
import Node from self.node as Node
import DummyRule from self.rules


//# Parser states (behaviors)

/*#
   @brief A single parser state (behavior).
   @param rules Sequence of rule objects tried by the parser on each line
          while this state is on top of the state stack.
   @param onUnmatched Handler for text not covered by any rule. The parser
          calls it as onUnmatched( ctx, text ), skips the call when it is
          the string "*", and raises a ParseError when it is not set.
   @param eolRule State change requested when a line has been consumed;
          a leading "*" makes it apply only when no rule matched on the line.
*/
class PState( rules, onUnmatched, eolRule )
   rules = nil
   onUnmatched = nil
   eolRule = nil

   init
      // plain value holder: store the constructor arguments as they are
      self.rules = rules
      self.onUnmatched = onUnmatched
      self.eolRule = eolRule
   end
end

/*#
   @brief Main parsing class.

   A generic parser is a parsing system based on token recognition
   and callback invocation. This class provides general token
   recognition and calls adequate callbacks.

   In strict terms, this class acts more like a @b lexer which drives
   a callback-driven parser provided by the user.

   The @b Parser class works by using a set of named states (in the
   class @a PState), which correspond to different rulesets which are
   used in different parsing steps.

   Rules, that is, user-provided callbacks, can return a string
   representing a state name; the corresponding state is then pushed
   on a stack, and can be popped later by returning the special
   state "#pop".

   Each @a PState provides a set of four elements:
      - separators: a string containing characters considered "blanks"
         which hasn't any grammar meaning except that of separating
         tokens and keywords. In example, " \t" would recognize spaces
         and tab characters as separators.
      - tokens: A list of @a Rule instances which match recognized tokens.
         Tokens are recognized when it can be demonstrated that no other
         token can be matched. In example, if the parser has "+" and "+="
         tokens, the proper action (either the one associated with + or +=)
         will be called when the ambiguity can be resolved.
      - keywords: A list of @a Rule instances which are recognized only if
         surrounded by separators or tokens. In example, a keyword "fun"
         would be recognized only if preceded and followed by any of the
         characters in the @a separators string, or by any recognized token.
      - onElement: it's a callback which gets called when identifying a sequence
        between separators and/or tokens which is not listed in the keywords.

   In example, the following parser recognizes and saves strings by pushing a different
   state when they are found:

   @code
   load parser

   // base state
   base = PState(
      separators|" \t\n\r",
      tokens| [ Rule( '"', onQuote ) ],
      onElement|ignoring )

   // state when parsing a string
   pstring = PState(
      separators|"", // spaces are relevant
      tokens|[
         Rule( "\\\"", onEscapeQuote ),
         Rule( "\"", onStringClosed ) ],
      onElement| addToString )

   content = ""
   function onQuote( qt )
      global content
      // starting string mode
      content = ""
      return "string"
   end

   function ignoring( item )
      > "Ignoring ", item
   end

   function addToString( data )
      global content
      content += data
   end

   function onEscapeQuote( qt )
      global content
      content += "\""
   end

   function onStringClosed( qt )
      global content
      > "Found string: >>", content, "<<"
      return "#pop"
   end

   parser = Parser()
   parser.addState( "base", base )
   parser.addState( "string", pstring )

   stream = StringStream( 'unrecognized text, "an \"interesting\" string", other text')
   parser.parse( stream, "base" )
   @endcode

   The parser ensures that all the tokens, keywords and unrecognized elements
   are notified via callbacks in the same order in which they are found in the
   parsed file.

   Other than returning a state name, the rule callbacks may also return a new
   instance of @a PState; this would push the state on the stack and make it
   the currently active state. This makes possible to create new parser states
   on the fly, in case some of the parsing conditions are not known in advance.

   It is also possible to modify existing states by i.e. adding new keywords or
   tokens as they become defined while parsing the source code.
*/

class Parser()
   //# Known states, looked up by name in pushState()
   states = nil

   //# Stack of active states; the last element is the current one
   _stack = []

   //# Names of the stacked states, kept parallel to _stack (traces and error messages)
   _stateNames = []

   //# false when asked to quit; checked by the parsing loops
   _proceed = false

   //# current parsing row
   row = 1

   //# true to activate rule tracing
   trace = false

   /*# Pushes a given state making it the active state.
   *  @param name The name of the state that should be pushed.
   *  @raise ParseError if the state name is not known.
   */
   function pushState( name )
      if not name in self.states
         raise ParseError( 10000, i"Setting invalid state", name )
      end

      self._stack += self.states[name]
      self._stateNames += name
      if self.trace: self.traceStates()
   end

   /*#
      @brief Pops current state.
      @raise ParseError if the stack holds only the topmost state (or is empty).
   */
   function popState()
      depth = self._stack.len()
      if depth <= 1
         raise ParseError( 10001, i"Cannot pop topmost state" )
      end

      // names are kept parallel to the stack, so both shrink to the same size
      arrayResize( self._stack, depth - 1 )
      arrayResize( self._stateNames, depth - 1 )
      if self.trace: self.traceStates()
   end

   /*#
      @brief Performs complete parsing of a string.
      @param string The text to be parsed; it is split into lines on "\n",
             with an adjacent "\r" treated as part of the line terminator.
      @optparam initState The name of the initial state (defaults to "start").
      @optparam initRow If given, set the initial row to this number
      @return The Context instance that was handed to the rules.
      @raise ParseError if the initial state is not known.

      Parsing stops at the end of the string or as soon as termination
      is requested (see @a Parser.terminate).
   */
   function parse( string, initState, initRow )
      if not initState: initState = "start"
      self.initParser( initState )
      if initRow: self.row = initRow

      ctx = Context()
      pos = 0
      strlen = string.len()

      loop
         // find the next line
         poseol = string.find( "\n", pos )
         eolsize = 1

         // consider \r\n or \n\r
         if poseol > pos and string[* (poseol-1)] == 0xD
            // the CR counts only when it lies inside the current line;
            // a CR before pos was already consumed by the previous terminator
            --poseol
            ++eolsize
         elif poseol >= 0 and poseol+1 < strlen and string[* (poseol+1)] == 0xD
            ++eolsize
         end

         line = string[pos: (poseol >= 0 ? poseol: strlen)]
         self.parseLine( line, ctx )

         // stop at end of input or when a rule asked to quit
         if poseol < 0 or not self._proceed: break
         pos = poseol + eolsize
      end

      return ctx
   end

   /*#
      @brief Parses the content of a stream, line by line.
      @param stream An input stream providing readLine().
      @optparam initState The name of the initial state. If not given, the
                parser must already have exactly one state on its stack.
      @return The Context instance that was handed to the rules.
      @raise ParseError if the parser has not been correctly initialized.

      Reading stops at the end of the stream or as soon as termination
      is requested (see @a Parser.terminate).
   */
   function parseStream( stream, initState )
      self.initParser( initState )
      ctx = Context()
      line = ""
      // test _proceed first, so that no further line is consumed after a quit request
      while self._proceed and stream.readLine( line, 4096 )
         self.parseLine( line, ctx )
      end

      return ctx
   end

   /*#
      @brief Prepares the parser for a new parsing run.
      @optparam initState Name of the state to start from. When given, the
                state stack is reset and this state is pushed.
      @raise ParseError if initState is not given and the stack doesn't
             hold exactly one state, or if initState is not known.
   */
   function initParser(initState)
      if initState
         self._stack = []
         // always reset the names: pushState() always appends to them,
         // so they must stay aligned with the stack even when not tracing
         self._stateNames = []
         self.pushState( initState )
      elif self._stack.len() != 1
         raise ParseError( 10002, i"Parser not initialized before parsing" )
      end

      self._proceed = true
      self.row = 1
   end

   /*#
      @brief Parses a single line with the rules of the active state.
      @param line The line to be parsed, without line terminator.
      @param ctx The context passed to rules and to the onUnmatched handlers.
      @raise ParseError if some text is not matched by any rule and the
             active state has no onUnmatched handler.

      At each step every rule of the state on top of the stack is asked
      to match; the winning rule is applied and its nextState is fulfilled.
      When the line is over, the eolRule of the last used state is followed,
      unless termination was requested. The row counter is incremented and
      stored in the context.
   */
   function parseLine( line, ctx )
      linelen = line.len()
      col = 0

      hadWinner = false
      if self.trace: self.traceText( line )
      atEnd = false

      loop
         matchAt = linelen
         winner = nil

         state = self._stack[-1]
         sr = state.rules

         ruleId = 0
         srlen = sr.len()
         while ruleId < srlen
            rule =  sr[ruleId++]
            rulePos = rule.match( line, col )
            if rulePos == 0
               // a result of 0 is taken as an immediate match: stop searching
               winner = rule
               hadWinner = true
               matchAt = 0
               break
            elif rulePos > 0
               if rulePos < matchAt
                  // nearest match so far
                  winner = rule
                  matchAt = rulePos
                  hadWinner = true
               elif rulePos == matchAt and not winner
                  // match at end
                  winner = rule
                  hadWinner = true
               end
            end
            // any other result is treated as "no match" for this rule
         end

         // text between the current column and the match is unmatched
         if col < matchAt
            prefix = line[col:matchAt]

            if (not state.onUnmatched)
               statedesc = ", ".merge(self._stateNames)
               raise ParseError( 10003, i"Unrecognized text", @"[$(self.row):$(col)]{$(statedesc)} \"$prefix\"" )
            end
            // "*" means: silently discard unmatched text
            if state.onUnmatched != "*": state.onUnmatched( ctx, prefix )
         end

         if winner
            // apply() result is used as the count of characters to advance
            count = winner.apply( ctx )
            if self.trace: self.traceRule( winner, ctx )
            self._fulfil( winner.nextState )
            // honor terminate() / "#quit": stop scanning this line
            if not self._proceed: break
            // ensure that end of lines with look-ahead are respected
            if winner.lookAhead: atEnd = false
         else
            // empty line and no matching rule, so it seems.
            break
         end

         col = matchAt + count

         // are we done?
         if col > linelen
            break
         elif col == linelen
            // offer an extra loop for rules matching at end of line
            if atEnd: break
            atEnd = true
         end
      end

      // no end-of-line transition once termination has been requested
      if self._proceed and state.eolRule and (not winner or winner.nextState == "#stay")
         // is this a conditional rule?
         if state.eolRule[0] == "*"
            if not hadWinner
               if self.trace: self.traceMsg("Following optional eolRule")
               self._fulfil( state.eolRule[1:] )
            end
         else
            if self.trace: self.traceMsg("Following mandatory eolRule")
            self._fulfil( state.eolRule )
         end
      end

      ++self.row
      ctx.row = self.row
   end


   /*#
      @brief Performs the state changes requested by a rule.
      @param slist Either a string with one or more state changes separated
             by ";", or a single item such as a @a PState instance.
      @raise ParseError if a requested change is not valid.

      Recognized changes are "#pop" (pop current state), "#stay" (do nothing
      and ignore the rest of the list), "#quit" (request termination), any
      other string (name of a state to be pushed) and @a PState instances
      (pushed as anonymous states).
   */
   function _fulfil( slist )
      // Only strings can be split; a PState instance is a single transition.
      if slist.typeId() == StringType
         states = slist.split( ";")
      else
         states = [ slist ]
      end

      si = 0
      l = states.len()
      while si < l
         what = states[si++]
         if what == "#pop"
            self.popState()
         elif what == "#stay"
            if self.trace: self.traceMsg( "Staying in current state" )
            return
         elif what == "#quit"
            self.terminate()
         elif what.typeId() == StringType
            self.pushState( what )
         elif what.derivedFrom( PState )
            // state created on the fly: push it directly under a synthetic name
            self._stack += what
            self._stateNames += "anon(" + what.toString()+ ")"
            if self.trace
               self.traceStates()
            end
         else
            // otherwise we have something wrong
            raise ParseError( 10004, i"Invalid state change", what )
         end
      end
   end

   /*#
      @brief Request parsing termination.

      A rule callback may call this method via @b sender.terminate()
      to request immediate exit from the parser sequence.

      A rule may also quit the parser by returning the special
      "#quit" state.
   */

   function terminate()
      self._proceed = false
   end

   //# Trace helper: prints the line that is about to be parsed.
   function traceText( line )
      > "[PTRC:text] <<", line, ">>"
   end

   //# Trace helper: prints the names of the states currently on the stack.
   function traceStates()
      > "[PTRC:stat] ", ", ".merge(self._stateNames)
   end

   //# Trace helper: prints a free-form message.
   function traceMsg( msg )
      > "[PTRC:msg ] ", msg
   end

   //# Trace helper: prints the rule that has just been applied and the context depth.
   function traceRule( r, context )
      >> "[PTRC:rule] (ctx depth:", context.nodeStack.len(),  ") Applying ", r.className(), "("
      if r provides token: >> '"', r.token.escape(), "\", "
      >> '"', r.nextState, '")'
      > " matcing ", r.matchLen, " chars."
   end
   
end

export
libfalcon-engine1 0.9.6.9-git20120606-2.1+b1 / usr / lib / falcon / parser / genparser.fal