Skip to content

refactor: replace bleach with nh3 (ammonia) #295

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ authors = [
]
readme = "README.rst"
license = {text = "Apache License, Version 2.0"}
dependencies = ["bleach>=2.1.0", "docutils>=0.13.1", "Pygments>=2.5.1"]
dependencies = ["nh3>=0.2.14", "docutils>=0.13.1", "Pygments>=2.5.1"]
classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
Expand Down
128 changes: 42 additions & 86 deletions readme_renderer/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from typing import Any, Dict, Iterator, List, Optional
from typing import Dict, Optional, Set

import bleach
import bleach.callbacks
import bleach.linkifier
import bleach.sanitizer
import nh3


ALLOWED_TAGS = [
ALLOWED_TAGS = {
# Bleach Defaults
"a", "abbr", "acronym", "b", "blockquote", "code", "em", "i", "li", "ol",
"strong", "ul",
Expand All @@ -32,102 +28,62 @@
"span", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead",
"tr", "tt", "kbd", "var", "input", "section", "aside", "nav", "s", "figure",
"figcaption",
]
}

# Per-tag attribute allowlist handed to the sanitizer. Each value is a set of
# attribute names. Every key is defined exactly once: a repeated key in a dict
# literal is silently overridden by its last occurrence, which hides stale
# entries from readers and linters alike.
ALLOWED_ATTRIBUTES = {
    # Bleach Defaults
    "a": {"href", "title"},
    "abbr": {"title"},
    "acronym": {"title"},

    # Custom Additions
    # NOTE(review): "*" is the sanitizer's wildcard key (attributes allowed on
    # every permitted tag) -- confirm against the nh3 documentation.
    "*": {"id"},
    "hr": {"class"},
    "img": {"src", "width", "height", "alt", "align", "class"},
    "span": {"class"},
    "th": {"align", "class"},
    "td": {"align", "colspan", "rowspan"},
    "div": {"align", "class"},
    "h1": {"align"},
    "h2": {"align"},
    "h3": {"align"},
    "h4": {"align"},
    "h5": {"align"},
    "h6": {"align"},
    "code": {"class"},
    "p": {"align", "class"},
    "pre": {"lang"},
    "ol": {"start"},
    "input": {"type", "checked", "disabled"},
    "aside": {"class"},
    "dd": {"class"},
    "dl": {"class"},
    "dt": {"class"},
    "ul": {"class"},
    "nav": {"class"},
    "figure": {"class"},
}


class DisabledCheckboxInputsFilter:
    """Token-stream filter that drops every ``input`` except inert checkboxes.

    An ``input`` token is passed through only when its attributes say
    ``type="checkbox"`` and ``disabled``, optionally ``checked``, and nothing
    else. All tokens whose name is not ``input`` pass through unchanged.
    """

    # The typeshed stubs for bleach (html5lib) filters are incomplete, hence
    # `typing.Any` for the wrapped source.
    # See https://github.com/python/typeshed/blob/505ea726415016e53638c8b584b8fdc9c722cac1/stubs/bleach/bleach/html5lib_shim.pyi#L7-L8 # noqa E501
    def __init__(self, source: Any) -> None:
        # Upstream iterable of token mappings; iterated lazily in __iter__.
        self.source = source

    def __iter__(self) -> Iterator[Dict[str, Optional[str]]]:
        for tok in self.source:
            if tok.get("name") != "input":
                yield tok
                continue

            # Attribute keys are 2-tuples whose first element is ignored
            # (presumably (namespace, name) as html5lib emits -- confirm).
            saw_checkbox = False
            saw_disabled = False
            for (_, attr), val in tok.get("data", {}).items():
                if attr == "type" and val == "checkbox":
                    saw_checkbox = True
                elif attr == "disabled":
                    saw_disabled = True
                elif attr != "checked":
                    # Anything else (including a non-checkbox `type`) makes
                    # the input unsafe: bail out without yielding it.
                    break
            else:
                # Reached only when no unsafe attribute was encountered.
                if saw_checkbox and saw_disabled:
                    yield tok

    def __getattr__(self, name: str) -> Any:
        # Delegate attributes not found on this object to the wrapped source.
        return getattr(self.source, name)


def clean(
    html: str,
    tags: Optional[Set[str]] = None,
    attributes: Optional[Dict[str, Set[str]]] = None
) -> Optional[str]:
    """Sanitize ``html`` with nh3 and return the cleaned markup.

    :param html: the HTML fragment to sanitize.
    :param tags: allowed tag names; defaults to ``ALLOWED_TAGS`` when ``None``.
    :param attributes: mapping of tag name to allowed attribute names;
        defaults to ``ALLOWED_ATTRIBUTES`` when ``None``.
    :returns: the sanitized HTML, or ``None`` if sanitizing raised
        ``ValueError``.
    """
    if tags is None:
        tags = ALLOWED_TAGS
    if attributes is None:
        attributes = ALLOWED_ATTRIBUTES

    try:
        # Pass the *resolved* allowlists: handing the module-level defaults to
        # nh3 unconditionally would silently ignore a caller's narrower
        # ``tags`` / ``attributes`` (e.g. ``tags={"br"}``).
        return nh3.clean(
            html,
            tags=tags,
            attributes=attributes,
            link_rel="nofollow",
            url_schemes={"http", "https", "mailto"},
        )
    except ValueError:
        # Kept from the bleach-based implementation, where a failed clean was
        # reported to callers as ``None``.
        # NOTE(review): confirm nh3 can actually raise ValueError here.
        return None
2 changes: 1 addition & 1 deletion readme_renderer/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@

def render(raw: str, **kwargs: Any) -> Optional[str]:
    """Render plain text as sanitized HTML with line breaks preserved.

    ``raw`` is passed through ``html_escape``, each newline is replaced with
    ``<br>``, and the result is sanitized with ``br`` as the only allowed tag.
    ``kwargs`` are accepted but unused. Returns whatever ``clean`` returns,
    which may be ``None``.
    """
    rendered = html_escape(raw).replace("\n", "<br>")
    # ``tags`` is a set, matching the ``Set[str]`` signature of ``clean``.
    return clean(rendered, tags={"br"})
2 changes: 1 addition & 1 deletion tests/fixtures/test_CommonMark_006.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
&lt;iframe src="http://mymalicioussite.com/"&gt;Click here&lt;/iframe&gt;
Click here
4 changes: 1 addition & 3 deletions tests/fixtures/test_CommonMark_007.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
<p>Something naughty this way comes</p>
&lt;script&gt;
alert("Hello");
&lt;/script&gt;

8 changes: 4 additions & 4 deletions tests/fixtures/test_CommonMark_008.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>

<span class="k">def</span> <span class="nf">make_sound</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Ruff!&#39;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">'Ruff!'</span><span class="p">)</span>

<span class="n">dog</span> <span class="o">=</span> <span class="n">Dog</span><span class="p">(</span><span class="s1">&#39;Fido&#39;</span><span class="p">)</span>
<span class="n">dog</span> <span class="o">=</span> <span class="n">Dog</span><span class="p">(</span><span class="s1">'Fido'</span><span class="p">)</span>
</pre>
<p>and then here is some bash:</p>
<pre lang="bash"><span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">&quot;</span><span class="nv">$1</span><span class="s2">&quot;</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">&quot;--help&quot;</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;OK&quot;</span>
<pre lang="bash"><span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">"</span><span class="nv">$1</span><span class="s2">"</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"--help"</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"OK"</span>
<span class="k">fi</span>
</pre>
<p>or click <a href="http://www.surveymonkey.com" rel="nofollow">SurveyMonkey</a></p>
2 changes: 1 addition & 1 deletion tests/fixtures/test_GFM_019.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
<p><a href="http://commonmark.org" rel="nofollow">http://commonmark.org</a></p>
<p>(Visit <a href="https://encrypted.google.com/search?q=Markup+(business)" rel="nofollow">https://encrypted.google.com/search?q=Markup+(business)</a>)</p>
<p>Anonymous FTP is available at <a>ftp://foo.bar.baz</a>.</p>
<p>Anonymous FTP is available at <a rel="nofollow">ftp://foo.bar.baz</a>.</p>
2 changes: 1 addition & 1 deletion tests/fixtures/test_GFM_020.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<p><a href="mailto:[email protected]">[email protected]</a></p>
<p><a href="mailto:[email protected]" rel="nofollow">[email protected]</a></p>
2 changes: 1 addition & 1 deletion tests/fixtures/test_GFM_021.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<p>hello@mail+xyz.example isn't valid, but <a href="mailto:[email protected]">[email protected]</a> is.</p>
<p>hello@mail+xyz.example isn't valid, but <a href="mailto:[email protected]" rel="nofollow">[email protected]</a> is.</p>
4 changes: 2 additions & 2 deletions tests/fixtures/test_GFM_022.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<p><a href="mailto:[email protected]">[email protected]</a></p>
<p><a href="mailto:[email protected]">[email protected]</a>.</p>
<p><a href="mailto:[email protected]" rel="nofollow">[email protected]</a></p>
<p><a href="mailto:[email protected]" rel="nofollow">[email protected]</a>.</p>
<p>[email protected]</p>
<p>[email protected]_</p>
14 changes: 7 additions & 7 deletions tests/fixtures/test_GFM_024.html
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
<ul>
<li><input type="checkbox" disabled> Valid unchecked checkbox</li>
<li><input type="checkbox" checked disabled> Valid checked checkbox</li>
<li> Invalid enabled checkbox</li>
<li><input type="checkbox" disabled=""> Valid unchecked checkbox</li>
<li><input type="checkbox" checked="" disabled=""> Valid checked checkbox</li>
<li><input type="checkbox"> Invalid enabled checkbox</li>
<li>

<input>
</li>
<li>

<input type="submit">
</li>
<li>

<input>
</li>
<li>

<input type="checkbox" checked="">
</li>
</ul>
2 changes: 1 addition & 1 deletion tests/fixtures/test_GFM_doublequotes.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<pre><code>This is code text.
</code></pre>
<pre lang="python3"><span class="k">def</span> <span class="nf">this_is_python</span><span class="p">():</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;This is a docstring.&quot;&quot;&quot;</span>
<span class="w"> </span><span class="sd">"""This is a docstring."""</span>
<span class="k">pass</span>
</pre>
<pre lang="go"><span class="kd">func</span><span class="w"> </span><span class="nx">ThisIsGo</span><span class="p">(){</span>
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/test_GFM_malicious_pre.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p>This is normal text.</p>
<pre lang="python3"><span class="k">def</span> <span class="nf">this_is_python</span><span class="p">():</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;This is a docstring.&quot;&quot;&quot;</span>
<span class="w"> </span><span class="sd">"""This is a docstring."""</span>
<span class="k">pass</span>
<span class="o">&lt;</span><span class="n">script</span> <span class="nb">type</span><span class="o">=</span><span class="s2">&quot;text/javascript&quot;</span><span class="o">&gt;</span><span class="n">alert</span><span class="p">(</span><span class="s1">&#39;I am evil.&#39;</span><span class="p">);</span><span class="o">&lt;/</span><span class="n">script</span><span class="o">&gt;</span>
<span class="o">&lt;</span><span class="n">script</span> <span class="nb">type</span><span class="o">=</span><span class="s2">"text/javascript"</span><span class="o">&gt;</span><span class="n">alert</span><span class="p">(</span><span class="s1">'I am evil.'</span><span class="p">);</span><span class="o">&lt;/</span><span class="n">script</span><span class="o">&gt;</span>
</pre>
4 changes: 2 additions & 2 deletions tests/fixtures/test_rst_008.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

</span><span class="n">dog</span> <span class="o">=</span> <span class="n">Dog</span><span class="p">(</span><span class="s1">'Fido'</span><span class="p">)</span></code></pre>
<p>and then here is some bash:</p>
<pre><code><span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">&quot;</span><span class="nv">$1</span><span class="s2">&quot;</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">&quot;--help&quot;</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span><span class="w">
</span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;OK&quot;</span><span class="w">
<pre><code><span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">"</span><span class="nv">$1</span><span class="s2">"</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"--help"</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span><span class="w">
</span><span class="nb">echo</span><span class="w"> </span><span class="s2">"OK"</span><span class="w">
</span><span class="k">fi</span></code></pre>
<p>or click <a href="http://www.surveymonkey.com" rel="nofollow">SurveyMonkey</a></p>
<pre><code>An unknown code fence block</code></pre>
2 changes: 1 addition & 1 deletion tests/fixtures/test_rst_bibtex.html
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
<pre><code><span class="nc">&#64;article</span><span class="p">{</span><span class="nl">the_impact_of_pygments_docutils_config_and_html5</span><span class="p">,</span><span class="w">
<pre><code><span class="nc">@article</span><span class="p">{</span><span class="nl">the_impact_of_pygments_docutils_config_and_html5</span><span class="p">,</span><span class="w">
</span><span class="na">year</span><span class="w"> </span><span class="p">=</span><span class="w"> </span><s>{2022}</s><span class="p">,</span></code></pre>
2 changes: 1 addition & 1 deletion tests/fixtures/test_rst_docinfo.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<dd class="created"><p>mer 02 ago 2017 14:49:24 CEST</p>
</dd>
<dt class="author">Author<span class="colon">:</span></dt>
<dd class="author"><p>Lele Gaifax &lt;<a href="mailto:lele&#37;&#52;&#48;metapensiero&#46;it">lele<span>&#64;</span>metapensiero<span>&#46;</span>it</a>&gt;</p></dd>
<dd class="author"><p>Lele Gaifax &lt;<a href="mailto:lele%40metapensiero.it" rel="nofollow">lele<span>@</span>metapensiero<span>.</span>it</a>&gt;</p></dd>
<dt class="license">License<span class="colon">:</span></dt>
<dd class="license"><p>GNU General Public License version 3 or later</p>
</dd>
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/test_rst_linkify.html
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ <h2>Development</h2>
<dd><p><a href="http://multigtfs.readthedocs.org/" rel="nofollow">http://multigtfs.readthedocs.org/</a></p>
</dd>
<dt>IRC<span class="colon">:</span></dt>
<dd><p><a>irc://irc.freenode.net/tulsawebdevs</a></p>
<dd><p><a rel="nofollow">irc://irc.freenode.net/tulsawebdevs</a></p>
</dd>
</dl>
</section>
Loading