Merge pull request #432 from willkg/431-charencoding

willkg · web-flow · commit cabd665db0b0 · 2019-01-08T12:00:15.000-05:00
Fix parsing "meta" tag with encoding attribute
diff --git a/CHANGES b/CHANGES
@@ -1,7 +1,7 @@
 Bleach changes
 ==============
 
-Version 3.0.3 (In development)
+Version 3.1.0 (In development)
 ------------------------------
 
 **Security fixes**
@@ -25,6 +25,12 @@ None
 * Fix cases where attribute names could have invalid characters in them.
   (#419)
 
+* Fix problems with ``LinkifyFilter`` not being able to match links
+  across ``&amp;``. (#422)
+
+* Fix ``InputStreamWithMemory`` lacking ``charEncoding`` and
+  ``changeEncoding`` when ``BleachHTMLParser`` parses ``meta`` tags. (#431)
+
 
 Version 3.0.2 (October 11th, 2018)
 ----------------------------------
diff --git a/bleach/__init__.py b/bleach/__init__.py
@@ -20,7 +20,7 @@
 # yyyymmdd
 __releasedate__ = ''
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.0.3.dev0'
+__version__ = '3.1.0.dev0'
 VERSION = parse_version(__version__)
 
 
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -181,6 +181,14 @@ def __init__(self, inner_stream):
     def errors(self):
         return self._inner_stream.errors
 
+    @property
+    def charEncoding(self):
+        return self._inner_stream.charEncoding
+
+    @property
+    def changeEncoding(self):
+        return self._inner_stream.changeEncoding
+
     def char(self):
         c = self._inner_stream.char()
         # char() can return None if EOF, so ignore that
diff --git a/tests/test_html5lib_shim.py b/tests/test_html5lib_shim.py
@@ -80,3 +80,65 @@ def test_serializer(data, expected):
     serialized = serializer.render(walker(dom))
 
     assert serialized == expected
+
+
+@pytest.mark.parametrize('parser_args, data, expected', [
+    # Make sure InputStreamWithMemory has charEncoding and changeEncoding
+    (
+        {},
+        '<meta charset="utf-8">',
+        '<meta charset="utf-8">'
+    ),
+    # With consume_entities=False, entities are passed through unconverted
+    # and their ampersands are escaped again when serialized
+    (
+        {'consume_entities': False},
+        'text &amp;&gt;&quot;',
+        'text &amp;amp;&amp;gt;&amp;quot;'
+    ),
+    # With consume_entities=True, entities are consumed and converted to
+    # their character equivalents; then &, <, and > are escaped when
+    # serialized
+    (
+        {'consume_entities': True},
+        'text &amp;&gt;&quot;',
+        'text &amp;&gt;"'
+    ),
+    # Test that "invalid-character-in-attribute-name" errors in tokenizing
+    # result in attributes with invalid names getting dropped
+    (
+        {},
+        '<a href="http://example.com"">',
+        '<a href="http://example.com"></a>'
+    ),
+    (
+        {},
+        '<a href=\'http://example.com\'\'>',
+        '<a href="http://example.com"></a>'
+    )
+])
+def test_bleach_html_parser(parser_args, data, expected):
+    args = {
+        'tags': None,
+        'strip': True,
+        'consume_entities': True
+    }
+    args.update(parser_args)
+
+    # Build a parser, walker, and serializer just like we do in clean()
+    parser = html5lib_shim.BleachHTMLParser(**args)
+    walker = html5lib_shim.getTreeWalker('etree')
+    serializer = html5lib_shim.BleachHTMLSerializer(
+        quote_attr_values='always',
+        omit_optional_tags=False,
+        escape_lt_in_attrs=True,
+        resolve_entities=False,
+        sanitize=False,
+        alphabetical_attributes=False,
+    )
+
+    # Parse, walk, and then serialize the output
+    dom = parser.parseFragment(data)
+    serialized = serializer.render(walker(dom))
+
+    assert serialized == expected
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
@@ -69,17 +69,6 @@ def ft(attrs, new=False):
     )
 
 
-def test_invalid_attribute_names():
-    """Test that "invalid-character-in-attribute-name" errors in tokenizing
-    result in attributes with invalid names get dropped.
-
-    """
-    assert (
-        linkify('<a href="http://example.com/"">') ==
-        '<a href="http://example.com/" rel="nofollow"></a>'
-    )
-
-
 @pytest.mark.parametrize('data,parse_email,expected', [
     (
         'a james@example.com mailto',