# /usr/lib/ruby/1.8/html/htmltokenizer.rb

# = HTMLTokenizer
#
# Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)
# Copyright:: Copyright (c) 2004 Ben Giddings
# License::   Distributes under the same terms as Ruby
#
#
# This is a partial port of the functionality behind Perl's HTML::TokeParser.
# Given a page, it progressively returns tokens from that page.
#
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

#
# A class to tokenize HTML.
#
# Example:
#
#   page = "<HTML>
#   <HEAD>
#   <TITLE>This is the title</TITLE>
#   </HEAD>
#    <!-- Here comes the <a href=\"missing.link\">blah</a>
#    comment body
#     -->
#    <BODY>
#      <H1>This is the header</H1>
#      <P>
#        This is the paragraph, it contains
#        <a href=\"link.html\">links</a>,
#        <img src=\"blah.gif\" optional alt='images
#        are
#        really cool'>.  Ok, here is some more text and
#        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
#      </P>
#    </body>
#    </HTML>
#    "
#    toke = HTMLTokenizer.new(page)
#
#    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
#    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
#    assert("links" == toke.getTrimmedText)
#    assert(toke.getTag("IMG", "A").attr_hash['optional'])
#    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
#
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The page being tokenized, as a string
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  # The content is converted with to_s and tokenizing begins at its
  # first character.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns nil once the whole page has been consumed; otherwise returns
  # an HTMLComment, HTMLTag or HTMLText built from the page text at the
  # current position.  Raises a RuntimeError when a comment or tag is
  # opened but never closed.
  def peekNextToken
    return nil if @cur_pos == @page.length

    if '<' == @page[@cur_pos, 1]
      # Next token is a tag of some kind
      if '!--' == @page[(@cur_pos + 1), 3]
        # Token is a comment, running through the first "-->"
        tag_end = @page.index('-->', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
      else
        # Token is a html tag, running through the first ">"
        tag_end = @page.index('>', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos .. tag_end])
      end
    else
      # Next token is text, running up to the next "<" or, when there
      # is none, to the end of the page (a range ending at -1)
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      HTMLText.new(@page[@cur_pos .. text_end])
    end
  end

  # Get the next token and move past it.  Returns nil at the end of
  # the page, otherwise an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  def getNextToken
    token = peekNextToken
    # Step over the token by the length of its raw source
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Tag names are matched case-insensitively.  With no arguments the
  # next tag of any kind is returned.  Every token up to and including
  # the returned tag is consumed; nil is returned (and the whole page
  # consumed) when no matching tag is found.
  def getTag(*sought_tags)
    wanted = sought_tags.collect { |name| name.downcase }

    while (tag = getNextToken)
      next unless tag.kind_of?(HTMLTag)
      break if wanted.empty? || wanted.include?(tag.tag_name)
    end
    tag
  end

  # Get all the text between the current position and the next tag
  # (if specified) or a specific later tag
  #
  # Without +until_tag+: returns the text token at the current position
  # and consumes it, or returns "" (consuming nothing) when a tag or
  # comment comes next or the end of the page has been reached.
  #
  # With +until_tag+: consumes tokens up to, but not including, the
  # first tag with that name (matched case-insensitively, like getTag)
  # and returns their non-empty texts, each followed by one space.
  # If the tag never appears, the rest of the page is consumed.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Next token is a tag, not text
      return "" if '<' == @page[@cur_pos, 1]

      # Next token is text, unless the page is exhausted and there is
      # no token at all
      token = getNextToken
      token.nil? ? "" : token.text
    else
      # Tag names are stored lowercased, so normalise the name we seek
      stop_tag = until_tag.to_s.downcase
      ret_str = ""

      while (token = peekNextToken)
        break if token.kind_of?(HTMLTag) && token.tag_name == stop_tag

        piece = token.text
        ret_str << (piece + " ") if "" != piece
        getNextToken
      end

      ret_str
    end
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end

end

# The parent class for all three types of HTML tokens
class HTMLToken
  # The exact source text this token was built from
  attr_accessor :raw

  # Build a token wrapping the given raw source text
  def initialize(text)
    @raw = text
  end

  # The string form of a token is its raw source, unchanged
  def to_s
    return raw
  end

  # Textual content of the token.  The base class has none, so this
  # is always the empty string.
  def text
    return ""
  end

  # The token's text with leading and trailing whitespace removed and
  # each remaining run of whitespace collapsed to a single space
  def trimmed_text
    stripped = text.strip
    stripped.gsub(/\s+/m, " ")
  end

  # A token equals any object whose to_s matches the token's raw source
  def ==(other)
    other_source = other.to_s
    raw == other_source
  end
end

# Class representing text that isn't inside a tag
class HTMLText < HTMLToken
  # For plain text the textual content is the raw source itself,
  # returned exactly as it was given to the constructor
  def text
    return self.raw
  end
end

# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body: everything between "<!--" and "-->", with the
  # whitespace next to those delimiters removed
  attr_accessor :contents

  # Build a comment token from its raw source.
  #
  # The whole of +text+ must be a single comment, i.e. start with
  # "<!--" and end with "-->"; otherwise a RuntimeError is raised.
  def initialize(text)
    super(text)
    # \A and \z anchor to the whole string.  The per-line anchors ^ and $
    # would accept multi-line text that merely contains a comment line.
    match = /\A<!--\s*(.*?)\s*-->\z/m.match(text)
    if match.nil?
      raise "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = match[1]
  end
end

# Class representing an HTML tag
class HTMLTag < HTMLToken
  # end_tag is true for a closing tag such as </a>, false otherwise.
  # tag_name is the lowercased tag name, prefixed with "/" for a
  # closing tag.
  attr_reader :end_tag, :tag_name

  # Build a tag token from its raw source, which must start with "<"
  # and end with ">".  Raises a RuntimeError if it does not, or if no
  # tag name can be found in it.
  def initialize(text)
    super(text)
    if '<' != text[0, 1] or '>' != text[-1, 1]
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = Hash.new

    # The name is the first run of word characters, ':' or '-'
    tag_name = text.scan(/[\w:-]+/)[0]
    if tag_name.nil?
      raise "Error, no tag name found in: #{text}"
    end

    if '/' == text[1, 1]
      # It's an end tag
      @end_tag = true
      @tag_name = '/' + tag_name.downcase
    else
      @end_tag = false
      @tag_name = tag_name.downcase
    end

    # Attributes are parsed on the first call to attr_hash
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker
  #
  # Keys are lowercased attribute names.  Quoted values have their
  # enclosing quotes removed; an attribute given without a value maps
  # to its own name as written in the tag.  End tags always yield an
  # empty hash.
  def attr_hash
    # Lazy initialize == don't build the hash until it's needed
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
  #
  # Only start tags named img or applet are consulted.
  def text
    return '' if end_tag

    case tag_name
    when 'img', 'applet'
      alt = attr_hash['alt']
      return alt unless alt.nil?
    end
    ''
  end

  private

  # Fill @attr_hash from the attribute portion of the raw tag source
  def parse_attributes
    # Everything after the tag name and its following whitespace, up
    # to the closing ">"
    attr_src = @raw.scan(/<[\w:-]+\s+(.*)>/m)[0]
    return unless attr_src.kind_of?(Array)

    # Each match is [name, value, unquoted_value]; value is nil for an
    # attribute written without "=..."
    pairs = attr_src[0].scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
    pairs.each do |item|
      name = item[0]
      value = item[1]
      val = if value.nil?
              # Valueless attribute: use its name as the value
              name
            elsif '"' == value[0, 1] or "'" == value[0, 1]
              # Strip the enclosing quotes
              value[1 .. -2]
            else
              value
            end
      @attr_hash[name.downcase] = val
    end
  end
end

# Self-test: the unit tests below are defined only when this file is
# the script being run, not when it is loaded by another program.
if $0 == __FILE__
  require 'test/unit'

  # Unit tests for HTMLTokenizer and the token classes
  class TC_TestHTMLTokenizer < Test::Unit::TestCase
    # An unquoted attribute value must still be picked up by attr_hash
    def test_bad_link
      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
      assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
    end

    # Tag and attribute names may contain a ':' namespace separator
    def test_namespace
      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
      assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
    end

    # A comment is returned as an HTMLComment whose contents exclude
    # the delimiters and the whitespace next to them
    def test_comment
      toke = HTMLTokenizer.new("<!-- comment on me -->")
      t = toke.getNextToken
      assert(HTMLComment == t.class)
      assert("comment on me" == t.contents)
    end


    # Walk a whole page.  The assertions depend on their order, because
    # each getTag / getTrimmedText call consumes tokens from the page.
    def test_full
      # The fixture includes a comment that contains a link, a valueless
      # attribute ("optional") and a quoted attribute value spanning
      # several lines.
      page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
<!-- Here comes the <a href=\"missing.link\">blah</a>
comment body
 -->
<BODY>
  <H1>This is the header</H1>
  <P>
    This is the paragraph, it contains
    <a href=\"link.html\">links</a>, 
    <img src=\"blah.gif\" optional alt='images
are
really cool'>.  Ok, here is some more text and
    <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
  </P>
</body>
</HTML>
"
      toke = HTMLTokenizer.new(page)

      # First header tag; its raw source is uppercase, hence the downcase
      assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
      # The link inside the comment is skipped; the first real link is found
      assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
      # Text immediately following that link tag
      assert("links" == toke.getTrimmedText)
      # Next IMG/A tag is the image, which carries the valueless attribute
      assert(toke.getTag("IMG", "A").attr_hash['optional'])
      # Next IMG/A tag is the second link
      assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
    end
  end
end
# libhtml-htmltokenizer-ruby 1.0-3 / usr / lib / ruby / 1.8 / html / htmltokenizer.rb