Use the tidy binary when available instead of the DL binding of libtidy.so (the latter is broken by the DL security fixes in Ruby 1.8.7-p72; see http://bugs.debian.org/500461). diff --exclude=debian -urN samizdat-0.6.1/lib/samizdat/models/content.rb samizdat-0.6.1-tidy/lib/samizdat/models/content.rb --- samizdat-0.6.1/lib/samizdat/models/content.rb 2009-02-02 16:15:15.000000000 +0200 +++ samizdat-0.6.1-tidy/lib/samizdat/models/content.rb 2009-02-02 16:22:10.000000000 +0200 @@ -238,7 +238,11 @@ } + "

\n" }.join end - Samizdat::Sanitize.new(config.xhtml).sanitize(html) + begin + Samizdat::Sanitize.new(config.xhtml).sanitize(html) + rescue Samizdat::SanitizeError => e + raise UserError, CGI.escapeHTML(e.message).untaint + end end private diff --exclude=debian -urN samizdat-0.6.1/lib/samizdat/sanitize.rb samizdat-0.6.1-tidy/lib/samizdat/sanitize.rb --- samizdat-0.6.1/lib/samizdat/sanitize.rb 2009-02-02 16:15:15.000000000 +0200 +++ samizdat-0.6.1-tidy/lib/samizdat/sanitize.rb 2009-02-02 16:22:30.000000000 +0200 @@ -8,12 +8,8 @@ # # vim: et sw=2 sts=2 ts=8 tw=0 -require 'cgi' -require 'yaml' require 'rexml/document' -require 'tidy' - # use (") instead of (') in XML attributes, escape both of them # module REXML @@ -29,14 +25,13 @@ module Samizdat -class Sanitize - TIDY_PATH = '/usr/lib/libtidy.so' +class SanitizeError < RuntimeError; end +class Sanitize begin - FORMATTER = REXML::Formatters::Default.new(true) # enable IE hack - rescue LoadError + rescue LoadError, NameError # backwards compatibility for Ruby versions without REXML::Formatters # @@ -50,13 +45,15 @@ FORMATTER = LegacyFormatter.new end - def initialize(xhtml, tidypath=TIDY_PATH) + # _xhtml_ is expected to be loaded from xhtml.yaml. + # + # _tidypath_ may point to a binary or library. If it's a library (detected by + # .so in the file name), Ruby/Tidy DL-based wrapper library will be used. If + # it's a binary, pipe will be used to filter HTML through it. + # + def initialize(xhtml, tidypath=nil) @xhtml = xhtml - - # workaround for memory leak in Tidy.path= - if not defined?(@@tidypath) or tidypath != @@tidypath - Tidy.path = @@tidypath = tidypath - end + set_tidy(tidypath) end attr_reader :xhtml @@ -103,35 +100,113 @@ # filter HTML through Tidy # def tidy(html) - xml = Tidy.open(:output_xhtml => true, :literal_attributes => true, - :tidy_mark => false, :wrap => 0, :char_encoding => 'utf8' - ) {|tidy| tidy.clean(html.to_s.untaint) } - - xml.taint + @tidy_binary ? 
tidy_pipe(html) : tidy_dl(html) end # return sanitized HTML # - def sanitize(html, fragment=true, filter=@xhtml) + def sanitize(html, filter=@xhtml) + html = tidy(html) + (html.nil? or html.empty?) and raise SanitizeError, + "Invalid HTML detected" + begin - xml = REXML::Document.new(tidy(html)).root - xml = xml.elements['//html/body'] if fragment # work around tidy + xml = REXML::Document.new(html).root + xml = xml.elements['//html/body'] rescue REXML::ParseException - raise RuntimeError, "Invalid HTML detected: " + - CGI.escapeHTML($!.continued_exception.to_s.gsub!(/\n.*/, '')) + raise SanitizeError, "Invalid XHTML detected: " + + $!.continued_exception.to_s.gsub(/\n.*/, '') end sanitize_element(xml, filter) html = '' - if fragment - xml.each {|child| FORMATTER.write(child, html) } - else - FORMATTER.write(xml, html) - end + xml.each {|child| FORMATTER.write(child, html) } html end + + private + + SO_PATH_PATTERN = Regexp.new(/\.so(?:\..+)?\z/).freeze + + def is_so?(path) + path =~ SO_PATH_PATTERN and File.readable?(path) + end + + def set_tidy(tidypath) + if tidypath.nil? 
+ [ '/usr/bin/tidy', + '/usr/local/bin/tidy', + '/usr/lib/libtidy.so', + '/usr/local/lib/libtidy.so' + ].each {|path| + if File.exists?(path) + tidypath = path + break + end + } + end + + if is_so?(tidypath) + require 'tidy' + + # workaround for memory leak in Tidy.path= + if not defined?(@@tidysopath) or tidypath != @@tidysopath + Tidy.path = @@tidysopath = tidypath + end + + @tidy_binary = nil + + elsif File.executable?(tidypath) + @tidy_binary = tidypath + end + + require 'open3' if @tidy_binary + end + + def tidy_dl(html) + xml = Tidy.open(:quiet => true, + :show_warnings => false, + :show_errors => 1, + :output_xhtml => true, + :literal_attributes => true, + :preserve_entities => true, + :tidy_mark => false, + :wrap => 0, + :char_encoding => 'utf8' + ) {|tidy| tidy.clean(html.to_s.untaint) } + + xml.taint + end + + def tidy_pipe(html) + stdin, stdout, stderr = + Open3.popen3(@tidy_binary + + ' --quiet yes' + + ' --show-warnings no' + + ' --show-errors 1' + + ' --output-xhtml yes' + + ' --literal-attributes yes' + + ' --preserve-entities yes' + + ' --tidy-mark no' + + ' --wrap 0' + + ' --char-encoding utf8') + + stdin.write(html.to_s.untaint) + stdin.close + + errors = stderr.read + stderr.close + + xhtml = stdout.read + stdout.close + + errors.nil? or errors.empty? or raise SanitizeError, + "Invalid HTML detected: " + errors + + xhtml + end end end # module Samizdat