fix: replace recursive approach to cdata with escaping solution

flavorjones · Dec 11, 2022 · 86f7f63 · 86f7f63
1 parent 415677f
commit 86f7f63
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 9 deletions.
diff --git a/lib/loofah/html5/scrub.rb b/lib/loofah/html5/scrub.rb
@@ -182,6 +182,46 @@ def force_correct_attribute_escaping!(node)
  end.force_encoding(encoding)
  end
  end
+
+ def cdata_needs_escaping?(node)
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+ node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+ end
+
+ def cdata_escape(node)
+ escaped_text = escape_tags(node.text)
+ if Nokogiri.jruby?
+ node.document.create_text_node(escaped_text)
+ else
+ node.document.create_cdata(escaped_text)
+ end
+ end
+
+ TABLE_FOR_ESCAPE_HTML__ = {
+ '<' => '&lt;',
+ '>' => '&gt;',
+ '&' => '&amp;',
+ }
+
+ def escape_tags(string)
+ # modified version of CGI.escapeHTML from ruby 3.1
+ enc = string.encoding
+ unless enc.ascii_compatible?
+ if enc.dummy?
+ origenc = enc
+ enc = Encoding::Converter.asciicompat_encoding(enc)
+ string = enc ? string.encode(enc) : string.b
+ end
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+ string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+ string.encode!(origenc) if origenc
+ string
+ else
+ string = string.b
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+ string.force_encoding(enc)
+ end
+ end
  end
  end
  end

diff --git a/lib/loofah/scrubber.rb b/lib/loofah/scrubber.rb
@@ -108,6 +108,10 @@ def html5lib_sanitize(node)
  return Scrubber::CONTINUE
  end
  when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+ if HTML5::Scrub.cdata_needs_escaping?(node)
+ node.before(HTML5::Scrub.cdata_escape(node))
+ return Scrubber::STOP
+ end
  return Scrubber::CONTINUE
  end
  Scrubber::STOP

diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb
@@ -100,13 +100,9 @@ def initialize
 
  def scrub(node)
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
- if node.children.length == 1 && node.children.first.cdata?
- sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
- node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
- else
- node.before node.children
- end
+ node.before(node.children)
  node.remove
+ return STOP
  end
  end
 

diff --git a/test/integration/test_ad_hoc.rb b/test/integration/test_ad_hoc.rb
@@ -100,17 +100,28 @@ def test_return_empty_string_when_nothing_left
  end
 
  def test_nested_script_cdata_tags_should_be_scrubbed
- html = "<script><script src='malicious.js'></script>"
+ html = "<script><script src=\"malicious.js\">this & that</script>"
  stripped = Loofah.fragment(html).scrub!(:strip)
+
  assert_empty stripped.xpath("//script")
- refute_match("<script", stripped.to_html)
+ assert_equal("&lt;script src=\"malicious.js\"&gt;this &amp; that", stripped.to_html)
  end
 
  def test_nested_script_cdata_tags_should_be_scrubbed_2
  html = "<script><script>alert('a');</script></script>"
  stripped = Loofah.fragment(html).scrub!(:strip)
+
  assert_empty stripped.xpath("//script")
- refute_match("<script", stripped.to_html)
+ assert_equal("&lt;script&gt;alert('a');", stripped.to_html)
+ end
+
+ def test_nested_script_cdata_tags_should_be_scrubbed_max_recursion
+ n = 40
+ html = "<div>" + ("<script>" * n) + "alert(1);" + ("</script>" * n) + "</div>"
+ expected = "<div>" + ("&lt;script&gt;" * (n-1)) + "alert(1);</div>"
+ actual = Loofah.fragment(html).scrub!(:strip).to_html
+
+ assert_equal(expected, actual)
  end
 
  def test_removal_of_all_tags