This file is indexed.

/usr/lib/ruby/1.8/samizdat/sanitize.rb is in libsamizdat-ruby1.8 0.6.2-2ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Samizdat HTML validation
#
#   Copyright (c) 2002-2009  Dmitry Borodaenko <angdraug@debian.org>
#
#   This program is free software.
#   You can distribute/modify this program under the terms of
#   the GNU General Public License version 3 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0

require 'rexml/document'

# use (") instead of (') in XML attributes, escape both of them
#
module REXML

class Attribute
  # Serialize the attribute as name="value": the value is always delimited
  # by double quotes, and both kinds of quote characters found inside it are
  # replaced with their entities.
  #
  def to_string
    escaped = to_s.gsub('"', '&quot;').gsub("'", '&apos;')
    "#{@expanded_name}=\"#{escaped}\""
  end
end

end   # module REXML


module Samizdat

# Raised when the input cannot be tidied or parsed into XHTML, or when no
# Tidy installation can be located.
#
class SanitizeError < RuntimeError; end

# Filters HTML through Tidy, then removes every element, attribute and
# inline CSS declaration that is not allowed by the supplied filter.
#
class Sanitize
  begin
    FORMATTER = REXML::Formatters::Default.new(true)   # enable IE hack

  rescue LoadError, NameError

    # backwards compatibility for Ruby versions without REXML::Formatters
    #
    class LegacyFormatter
      def write(node, output)
        return unless node.respond_to?(:write)
        node.write(output, -1, false, true)
      end
    end

    FORMATTER = LegacyFormatter.new
  end

  # _xhtml_ is expected to be loaded from xhtml.yaml.
  #
  # _tidypath_ may point to a binary or library. If it's a library (detected by
  # .so in the file name), Ruby/Tidy DL-based wrapper library will be used. If
  # it's a binary, pipe will be used to filter HTML through it.
  #
  # When _tidypath_ is nil, a list of well-known locations is searched and
  # SanitizeError is raised if none of them exists.
  #
  def initialize(xhtml, tidypath=nil)
    @xhtml = xhtml
    set_tidy(tidypath)
  end

  attr_reader :xhtml

  # a single "property: value value ..." CSS declaration; the property name
  # is captured in group 1
  #
  CSS = %r{
    \A\s*
    ([-a-z0-9]+) : \s*
    (?: (?: [-./a-z0-9]+ | \#[0-9a-f]+ | [0-9]+% ) \s* ) +
    \s*\z
  }xi.freeze

  # Return true when every ';'-separated declaration in _style_ matches CSS
  # and names a property included in _css_, false otherwise.
  #
  def check_style(css, style)
    style.split(';').all? do |declaration|
      match = CSS.match(declaration)
      match && css.include?(match[1])
    end
  end

  # compare elements and attributes with xhtml.yaml
  #
  # _xml_ is modified in place: an element whose name starts with '_' or is
  # not a key of _filter_ is deleted from its document; an attribute is
  # deleted unless the pattern registered for it (under '_common' or under
  # the element name) matches its value; a style attribute must also pass
  # check_style when _filter_ has a '_css' entry. The same _filter_ is
  # applied to all descendants.
  #
  def sanitize_element(xml, filter=@xhtml)
    if xml.name =~ /\A_/ or not filter.key?(xml.name)
      # doesn't work without xpath
      xml.document.delete_element(xml.xpath)
      return
    end

    if xml.has_attributes?
      # a filter without '_common' or with an empty element entry is fine
      allowed = (filter['_common'] || {}).merge(filter[xml.name] || {})
      css = filter['_css']

      xml.attributes.each_attribute do |a|
        keep = (allowed[a.name] === a.to_s)
        if keep and css and 'style' == a.name
          # sanitize CSS in style="" attributes
          keep = check_style(css, a.value)
        end
        xml.delete_attribute(a.name) unless keep
      end
    end

    if xml.has_elements?   # recurse
      # pass the filter along, otherwise children would silently fall back
      # to @xhtml even when the caller supplied a different filter
      xml.elements.each {|e| sanitize_element(e, filter) }
    end
  end

  # filter HTML through Tidy
  #
  def tidy(html)
    @tidy_binary ? tidy_pipe(html) : tidy_dl(html)
  end

  # return sanitized HTML
  #
  # Raises SanitizeError when Tidy returns nothing, when its output is not
  # well-formed XML, or when the output has no html/body element.
  #
  def sanitize(html, filter=@xhtml)
    html = tidy(html)
    (html.nil? or html.empty?) and raise SanitizeError,
      "Invalid HTML detected"

    begin
      root = REXML::Document.new(html).root
      xml = root && root.elements['//html/body']
    rescue REXML::ParseException => e
      raise SanitizeError, "Invalid XHTML detected: " +
        e.continued_exception.to_s.gsub(/\n.*/, '')
    end

    # a document without <body> used to fail with NoMethodError on nil
    xml.nil? and raise SanitizeError,
      "Invalid XHTML detected: no body element"

    sanitize_element(xml, filter)

    html = ''
    xml.each {|child| FORMATTER.write(child, html) }

    html
  end

  private

  SO_PATH_PATTERN = /\.so(?:\..+)?\z/.freeze

  # locations probed, in order, when no explicit Tidy path is given
  #
  TIDY_SEARCH_PATHS = [
    '/usr/bin/tidy',
    '/usr/local/bin/tidy',
    '/usr/lib/libtidy.so',
    '/usr/local/lib/libtidy.so'
  ].freeze

  # command line options for the Tidy binary, one array item per argument
  #
  TIDY_PIPE_OPTIONS = [
    '--quiet', 'yes',
    '--show-warnings', 'no',
    '--show-errors', '1',
    '--output-xhtml', 'yes',
    '--literal-attributes', 'yes',
    '--preserve-entities', 'yes',
    '--tidy-mark', 'no',
    '--wrap', '0',
    '--char-encoding', 'utf8'
  ].freeze

  # true when _path_ looks like a shared library and is readable
  #
  def is_so?(path)
    path =~ SO_PATH_PATTERN and File.readable?(path)
  end

  # Decide between the Tidy library and the Tidy binary.
  #
  # Sets @tidy_binary to the executable path, or to nil when the library is
  # used. A non-nil _tidypath_ that is neither a readable library nor an
  # executable leaves @tidy_binary unset.
  #
  def set_tidy(tidypath)
    if tidypath.nil?
      # File.exist? is available in every Ruby version, File.exists? is not
      tidypath = TIDY_SEARCH_PATHS.find {|path| File.exist?(path) }
      tidypath.nil? and raise SanitizeError,
        "Tidy not found in: " + TIDY_SEARCH_PATHS.join(', ')
    end

    if is_so?(tidypath)
      require 'tidy'

      # workaround for memory leak in Tidy.path=
      if not defined?(@@tidysopath) or tidypath != @@tidysopath
        Tidy.path = @@tidysopath = tidypath
      end

      @tidy_binary = nil

    elsif File.executable?(tidypath)
      @tidy_binary = tidypath
    end

    require 'open3' if @tidy_binary
  end

  # untaint _str_ on Ruby versions that still have taint tracking
  #
  def strip_taint(str)
    str.respond_to?(:untaint) ? str.untaint : str
  end

  # taint _obj_ on Ruby versions that still have taint tracking
  #
  def mark_tainted(obj)
    obj.respond_to?(:taint) ? obj.taint : obj
  end

  # run _html_ through the Tidy library, return its output
  #
  def tidy_dl(html)
    xml = Tidy.open(:quiet => true,
                    :show_warnings => false,
                    :show_errors => 1,
                    :output_xhtml => true,
                    :literal_attributes => true,
                    :preserve_entities => true,
                    :tidy_mark => false,
                    :wrap => 0,
                    :char_encoding => 'utf8'
    ) {|tidy| tidy.clean(strip_taint(html.to_s)) }

    mark_tainted(xml)
  end

  # run _html_ through the Tidy binary, return its standard output
  #
  # Raises SanitizeError when Tidy writes anything to standard error.
  #
  def tidy_pipe(html)
    input = strip_taint(html.to_s)
    xhtml = errors = nil

    # argument list form: no shell is involved, so a path with spaces works;
    # block form: all three streams are closed even if an exception is raised
    Open3.popen3(@tidy_binary, *TIDY_PIPE_OPTIONS) do |stdin, stdout, stderr|
      # drain both output streams concurrently: reading one of them to the
      # end first blocks forever once Tidy fills the other pipe's buffer
      out_reader = Thread.new { stdout.read }
      err_reader = Thread.new { stderr.read }

      begin
        stdin.write(input)
      ensure
        stdin.close
        xhtml = out_reader.value
        errors = err_reader.value
      end
    end

    errors.nil? or errors.empty? or raise SanitizeError,
      "Invalid HTML detected: " + errors

    xhtml
  end
end

end   # module Samizdat