|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 |
|
15 |
| -import functools |
16 |
| -from typing import Any, Dict, Iterator, List, Optional |
| 15 | +from typing import Dict, Optional, Set |
17 | 16 |
|
18 |
| -import bleach |
19 |
| -import bleach.callbacks |
20 |
| -import bleach.linkifier |
21 |
| -import bleach.sanitizer |
| 17 | +import nh3 |
22 | 18 |
|
23 | 19 |
|
24 |
| -ALLOWED_TAGS = [ |
| 20 | +ALLOWED_TAGS = { |
25 | 21 | # Bleach Defaults
|
26 | 22 | "a", "abbr", "acronym", "b", "blockquote", "code", "em", "i", "li", "ol",
|
27 | 23 | "strong", "ul",
|
|
32 | 28 | "span", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead",
|
33 | 29 | "tr", "tt", "kbd", "var", "input", "section", "aside", "nav", "s", "figure",
|
34 | 30 | "figcaption",
|
35 |
| -] |
| 31 | +} |
36 | 32 |
|
# Per-tag attribute allowlist used as the default by ``clean``.
# Keys are tag names ("*" is the wildcard entry); values are sets of attribute
# names, matching the ``Dict[str, Set[str]]`` annotation on ``clean``.
ALLOWED_ATTRIBUTES = {
    # Bleach Defaults (carried over unchanged from the previous sanitizer)
    "a": {"href", "title"},
    "abbr": {"title"},
    "acronym": {"title"},

    # Custom Additions
    "*": {"id"},
    "hr": {"class"},
    "img": {"src", "width", "height", "alt", "align", "class"},
    "span": {"class"},
    "th": {"align", "class"},
    "td": {"align", "colspan", "rowspan"},
    "div": {"align", "class"},
    "h1": {"align"},
    "h2": {"align"},
    "h3": {"align"},
    "h4": {"align"},
    "h5": {"align"},
    "h6": {"align"},
    "code": {"class"},
    "p": {"align", "class"},
    "pre": {"lang"},
    "ol": {"start"},
    "input": {"type", "checked", "disabled"},
    "aside": {"class"},
    "dd": {"class"},
    "dl": {"class"},
    "dt": {"class"},
    "ul": {"class"},
    "nav": {"class"},
    "figure": {"class"},
}
|
70 | 66 |
|
71 | 67 |
|
72 |
| -class DisabledCheckboxInputsFilter: |
73 |
| - # The typeshed for bleach (html5lib) filters is incomplete, use `typing.Any` |
74 |
| - # See https://github.com/python/typeshed/blob/505ea726415016e53638c8b584b8fdc9c722cac1/stubs/bleach/bleach/html5lib_shim.pyi#L7-L8 # noqa E501 |
75 |
| - def __init__(self, source: Any) -> None: |
76 |
| - self.source = source |
77 |
| - |
78 |
| - def __iter__(self) -> Iterator[Dict[str, Optional[str]]]: |
79 |
| - for token in self.source: |
80 |
| - if token.get("name") == "input": |
81 |
| - # only allow disabled checkbox inputs |
82 |
| - is_checkbox, is_disabled, unsafe_attrs = False, False, False |
83 |
| - for (_, attrname), value in token.get("data", {}).items(): |
84 |
| - if attrname == "type" and value == "checkbox": |
85 |
| - is_checkbox = True |
86 |
| - elif attrname == "disabled": |
87 |
| - is_disabled = True |
88 |
| - elif attrname != "checked": |
89 |
| - unsafe_attrs = True |
90 |
| - break |
91 |
| - if is_checkbox and is_disabled and not unsafe_attrs: |
92 |
| - yield token |
93 |
| - else: |
94 |
| - yield token |
95 |
| - |
96 |
| - def __getattr__(self, name: str) -> Any: |
97 |
| - return getattr(self.source, name) |
98 |
| - |
99 |
| - |
def clean(
    html: str,
    tags: Optional[Set[str]] = None,
    attributes: Optional[Dict[str, Set[str]]] = None
) -> Optional[str]:
    """Sanitize an HTML fragment with nh3 and return the cleaned markup.

    Args:
        html: The HTML text to sanitize.
        tags: Tag names to allow; ``ALLOWED_TAGS`` is used when ``None``.
        attributes: Mapping of tag name to the set of attribute names allowed
            on it; ``ALLOWED_ATTRIBUTES`` is used when ``None``.

    Returns:
        The sanitized HTML, or ``None`` if sanitizing raised ``ValueError``.
    """
    if tags is None:
        tags = ALLOWED_TAGS
    if attributes is None:
        attributes = ALLOWED_ATTRIBUTES

    # NOTE(review): the previous bleach-based cleaner additionally dropped
    # <input> elements that were not disabled checkboxes; nothing in this call
    # reproduces that restriction -- confirm whether it is still required.
    try:
        return nh3.clean(
            html,
            # Pass the resolved arguments (not the module defaults) so that
            # caller-supplied allowlists actually take effect.
            tags=tags,
            attributes=attributes,
            link_rel="nofollow",
            url_schemes={"http", "https", "mailto"},
        )
    except ValueError:
        return None
|
0 commit comments