Skip to content

Commit a209500

Browse files
Authored by sobolevn, AlexWaygood, and Avasam
Complete stubs for bleach (#9314)
Co-authored-by: Alex Waygood <[email protected]> Co-authored-by: Avasam <[email protected]>
1 parent 78b7dc6 commit a209500

9 files changed

+128
-66
lines changed

pyrightconfig.stricter.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
"stubs/antlr4-python3-runtime",
2828
"stubs/aws-xray-sdk",
2929
"stubs/beautifulsoup4",
30-
"stubs/bleach",
3130
"stubs/boltons",
3231
"stubs/boto",
3332
"stubs/braintree",
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
1-
bleach.css_sanitizer # Requires tinycss2 to be installed
2-
bleach.html5lib_shim.*
1+
# Internal private stuff:
2+
bleach._vendor.*
3+
4+
# Is a property returning a method, simplified:
5+
bleach.html5lib_shim.InputStreamWithMemory.changeEncoding

stubs/bleach/METADATA.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
version = "6.1.*"
2+
requires = ["types-html5lib"]
23
upstream_repository = "https://github.com/mozilla/bleach"
3-
partial_stub = true
44

55
[tool.stubtest]
6-
ignore_missing_stub = true
6+
extras = ["css"]

stubs/bleach/bleach/css_sanitizer.pyi

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from collections.abc import Container
2+
from typing import Final
23

3-
ALLOWED_CSS_PROPERTIES: frozenset[str]
4-
ALLOWED_SVG_PROPERTIES: frozenset[str]
4+
ALLOWED_CSS_PROPERTIES: Final[frozenset[str]]
5+
ALLOWED_SVG_PROPERTIES: Final[frozenset[str]]
56

67
class CSSSanitizer:
78
allowed_css_properties: Container[str]

stubs/bleach/bleach/html5lib_shim.pyi

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,70 @@
1-
from _typeshed import Incomplete
1+
import re
2+
from codecs import CodecInfo
23
from collections.abc import Generator, Iterable, Iterator
4+
from typing import Any, Final, Protocol
35

4-
class HTMLParser: # actually html5lib.HTMLParser
5-
def __getattr__(self, __name: str) -> Incomplete: ...
6+
# We don't re-export any `html5lib` types / values here, because they are not
7+
# really public and may change at any time. This is just a helper module,
8+
# import things directly from `html5lib` instead!
9+
from html5lib import HTMLParser
10+
from html5lib._inputstream import HTMLBinaryInputStream, HTMLUnicodeInputStream
11+
from html5lib._tokenizer import HTMLTokenizer
12+
from html5lib._trie import Trie
13+
from html5lib.serializer import HTMLSerializer
14+
from html5lib.treewalkers.base import TreeWalker
615

7-
class Filter: # actually html5lib.filters.base.Filter
8-
source: Incomplete
9-
def __init__(self, source) -> None: ...
10-
def __iter__(self) -> Iterator[Incomplete]: ...
11-
def __getattr__(self, name: str) -> Incomplete: ... # copy attributes from source
16+
# Is actually webencodings.Encoding
17+
class _Encoding(Protocol):
18+
name: str
19+
codec_info: CodecInfo
20+
def __init__(self, name: str, codec_info: CodecInfo) -> None: ...
1221

13-
class SanitizerFilter: # actually html5lib.filters.sanitizer.Filter
14-
def __getattr__(self, __name: str) -> Incomplete: ...
22+
HTML_TAGS: Final[frozenset[str]]
23+
HTML_TAGS_BLOCK_LEVEL: Final[frozenset[str]]
24+
AMP_SPLIT_RE: Final[re.Pattern[str]]
25+
ENTITIES: Final[dict[str, str]]
26+
ENTITIES_TRIE: Final[Trie]
27+
TAG_TOKEN_TYPES: Final[set[int]]
28+
TAG_TOKEN_TYPE_CHARACTERS: Final[int]
29+
TAG_TOKEN_TYPE_END: Final[int]
30+
TAG_TOKEN_TYPE_PARSEERROR: Final[int]
31+
TAG_TOKEN_TYPE_START: Final[int]
1532

16-
class HTMLSerializer: # actually html5lib.serializer.HTMLSerializer
17-
def __getattr__(self, __name: str) -> Incomplete: ...
33+
class InputStreamWithMemory:
34+
position = HTMLUnicodeInputStream.position
35+
reset = HTMLUnicodeInputStream.reset
36+
def __init__(self, inner_stream: HTMLUnicodeInputStream) -> None: ...
37+
@property
38+
def errors(self) -> list[str]: ...
39+
@property
40+
def charEncoding(self) -> tuple[_Encoding, str]: ...
41+
# If inner_stream wasn't a HTMLBinaryInputStream, this will error at runtime
42+
# Is a property returning a method, simplified:
43+
changeEncoding = HTMLBinaryInputStream.changeEncoding
44+
def char(self) -> str: ...
45+
def charsUntil(self, characters: Iterable[str], opposite: bool = False) -> str: ...
46+
def unget(self, char: str | None) -> None: ...
47+
def get_tag(self) -> str: ...
48+
def start_tag(self) -> None: ...
49+
50+
class BleachHTMLTokenizer(HTMLTokenizer):
51+
consume_entities: bool
52+
stream: InputStreamWithMemory
53+
emitted_last_token: dict[str, Any] | None
54+
def __init__(self, consume_entities: bool = False, **kwargs: Any) -> None: ...
1855

1956
class BleachHTMLParser(HTMLParser):
2057
tags: list[str] | None
2158
strip: bool
2259
consume_entities: bool
23-
def __init__(self, tags: Iterable[str] | None, strip: bool, consume_entities: bool, **kwargs) -> None: ...
60+
def __init__(self, tags: Iterable[str] | None, strip: bool, consume_entities: bool, **kwargs: Any) -> None: ...
2461

2562
class BleachHTMLSerializer(HTMLSerializer):
2663
escape_rcdata: bool
2764
def escape_base_amp(self, stoken: str) -> Generator[str, None, None]: ...
28-
def serialize(self, treewalker, encoding: str | None = None) -> Generator[str, None, None]: ...
65+
def serialize(self, treewalker: TreeWalker, encoding: str | None = None) -> Generator[str, None, None]: ... # type: ignore[override]
2966

30-
def __getattr__(__name: str) -> Incomplete: ...
67+
def convert_entity(value: str) -> str | None: ...
68+
def convert_entities(text: str) -> str: ...
69+
def match_entity(stream: str) -> str | None: ...
70+
def next_possible_entity(text: str) -> Iterator[str]: ...

stubs/bleach/bleach/linkifier.pyi

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
from _typeshed import Incomplete
2-
from collections.abc import Container, Iterable, Iterator
2+
from collections.abc import Container, Iterable, Iterator, Sequence
33
from re import Pattern
4+
from typing import Any, Final
5+
from typing_extensions import TypeAlias
46

5-
from .callbacks import _Callback
6-
from .html5lib_shim import Filter
7+
from html5lib.filters.base import Filter
8+
from html5lib.treewalkers.base import TreeWalker
79

8-
DEFAULT_CALLBACKS: list[_Callback]
10+
from .callbacks import _Callback, _HTMLAttrs
911

10-
TLDS: list[str]
12+
DEFAULT_CALLBACKS: Final[list[_Callback]]
13+
TLDS: Final[list[str]]
1114

1215
def build_url_re(tlds: Iterable[str] = ..., protocols: Iterable[str] = ...) -> Pattern[str]: ...
1316

14-
URL_RE: Pattern[str]
15-
PROTO_RE: Pattern[str]
17+
URL_RE: Final[Pattern[str]]
18+
PROTO_RE: Final[Pattern[str]]
1619

1720
def build_email_re(tlds: Iterable[str] = ...) -> Pattern[str]: ...
1821

19-
EMAIL_RE: Pattern[str]
22+
EMAIL_RE: Final[Pattern[str]]
2023

2124
class Linker:
2225
def __init__(
@@ -30,6 +33,10 @@ class Linker:
3033
) -> None: ...
3134
def linkify(self, text: str) -> str: ...
3235

36+
# TODO: `_Token` might be converted into `TypedDict`
37+
# or `html5lib` token might be reused
38+
_Token: TypeAlias = dict[str, Any]
39+
3340
class LinkifyFilter(Filter):
3441
callbacks: Iterable[_Callback]
3542
skip_tags: Container[str]
@@ -38,18 +45,18 @@ class LinkifyFilter(Filter):
3845
email_re: Pattern[str]
3946
def __init__(
4047
self,
41-
source,
48+
source: TreeWalker,
4249
callbacks: Iterable[_Callback] | None = ...,
4350
skip_tags: Container[str] | None = None,
4451
parse_email: bool = False,
4552
url_re: Pattern[str] = ...,
4653
email_re: Pattern[str] = ...,
4754
) -> None: ...
48-
def apply_callbacks(self, attrs, is_new): ...
49-
def extract_character_data(self, token_list): ...
50-
def handle_email_addresses(self, src_iter): ...
51-
def strip_non_url_bits(self, fragment): ...
52-
def handle_links(self, src_iter): ...
53-
def handle_a_tag(self, token_buffer): ...
54-
def extract_entities(self, token): ...
55+
def apply_callbacks(self, attrs: _HTMLAttrs, is_new: bool) -> _HTMLAttrs | None: ...
56+
def extract_character_data(self, token_list: Iterable[_Token]) -> str: ...
57+
def handle_email_addresses(self, src_iter: Iterable[_Token]) -> Iterator[_Token]: ...
58+
def strip_non_url_bits(self, fragment: str) -> tuple[str, str, str]: ...
59+
def handle_links(self, src_iter: Iterable[_Token]) -> Iterator[_Token]: ...
60+
def handle_a_tag(self, token_buffer: Sequence[_Token]) -> Iterator[_Token]: ...
61+
def extract_entities(self, token: _Token) -> Iterator[_Token]: ...
5562
def __iter__(self) -> Iterator[Incomplete]: ...

stubs/bleach/bleach/parse_shim.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from urllib import parse as parse

stubs/bleach/bleach/sanitizer.pyi

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,27 @@
11
from _typeshed import Incomplete
2-
from collections.abc import Callable, Iterable
2+
from collections.abc import Callable, Container, Iterable, Iterator
33
from re import Pattern
4-
from typing import Protocol
4+
from typing import Final, Protocol
55
from typing_extensions import TypeAlias
66

7+
from html5lib.filters.base import Filter
8+
from html5lib.filters.sanitizer import Filter as SanitizerFilter
9+
from html5lib.treewalkers.base import TreeWalker
10+
711
from . import _HTMLAttrKey
812
from .css_sanitizer import CSSSanitizer
9-
from .html5lib_shim import BleachHTMLParser, BleachHTMLSerializer, SanitizerFilter
13+
from .html5lib_shim import BleachHTMLParser, BleachHTMLSerializer
14+
from .linkifier import _Token
15+
16+
ALLOWED_TAGS: Final[frozenset[str]]
17+
ALLOWED_ATTRIBUTES: Final[dict[str, list[str]]]
18+
ALLOWED_PROTOCOLS: Final[frozenset[str]]
1019

11-
ALLOWED_TAGS: frozenset[str]
12-
ALLOWED_ATTRIBUTES: dict[str, list[str]]
13-
ALLOWED_PROTOCOLS: frozenset[str]
20+
INVISIBLE_CHARACTERS: Final[str]
21+
INVISIBLE_CHARACTERS_RE: Final[Pattern[str]]
22+
INVISIBLE_REPLACEMENT_CHAR: Final = "?"
1423

15-
INVISIBLE_CHARACTERS: str
16-
INVISIBLE_CHARACTERS_RE: Pattern[str]
17-
INVISIBLE_REPLACEMENT_CHAR: str
24+
class NoCssSanitizerWarning(UserWarning): ...
1825

1926
# A html5lib Filter class
2027
class _Filter(Protocol):
@@ -24,18 +31,16 @@ _AttributeFilter: TypeAlias = Callable[[str, str, str], bool]
2431
_AttributeDict: TypeAlias = dict[str, list[str] | _AttributeFilter] | dict[str, list[str]] | dict[str, _AttributeFilter]
2532
_Attributes: TypeAlias = _AttributeFilter | _AttributeDict | list[str]
2633

27-
_TreeWalker: TypeAlias = Callable[[Incomplete], Incomplete]
28-
2934
class Cleaner:
3035
tags: Iterable[str]
3136
attributes: _Attributes
3237
protocols: Iterable[str]
3338
strip: bool
3439
strip_comments: bool
35-
filters: Iterable[_Filter]
40+
filters: Iterable[Filter]
3641
css_sanitizer: CSSSanitizer | None
3742
parser: BleachHTMLParser
38-
walker: _TreeWalker
43+
walker: TreeWalker
3944
serializer: BleachHTMLSerializer
4045
def __init__(
4146
self,
@@ -63,7 +68,7 @@ class BleachSanitizerFilter(SanitizerFilter):
6368
css_sanitizer: CSSSanitizer | None
6469
def __init__(
6570
self,
66-
source,
71+
source: TreeWalker,
6772
allowed_tags: Iterable[str] = ...,
6873
attributes: _Attributes = ...,
6974
allowed_protocols: Iterable[str] = ...,
@@ -74,12 +79,11 @@ class BleachSanitizerFilter(SanitizerFilter):
7479
strip_html_comments: bool = True,
7580
css_sanitizer: CSSSanitizer | None = None,
7681
) -> None: ...
77-
def sanitize_stream(self, token_iterator): ...
78-
def merge_characters(self, token_iterator): ...
79-
def __iter__(self): ...
80-
def sanitize_token(self, token): ...
81-
def sanitize_characters(self, token): ...
82-
def sanitize_uri_value(self, value, allowed_protocols): ...
83-
def allow_token(self, token): ...
84-
def disallowed_token(self, token): ...
85-
def sanitize_css(self, style): ...
82+
def sanitize_stream(self, token_iterator: Iterable[_Token]) -> Iterator[_Token]: ...
83+
def merge_characters(self, token_iterator: Iterable[_Token]) -> Iterator[_Token]: ...
84+
def __iter__(self) -> Iterator[_Token]: ...
85+
def sanitize_token(self, token: _Token) -> _Token | list[_Token] | None: ...
86+
def sanitize_characters(self, token: _Token) -> _Token | list[_Token]: ...
87+
def sanitize_uri_value(self, value: str, allowed_protocols: Container[str]) -> str | None: ...
88+
def allow_token(self, token: _Token) -> _Token: ...
89+
def disallowed_token(self, token: _Token) -> _Token: ...

stubs/html5lib/html5lib/_inputstream.pyi

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
from _typeshed import Incomplete, SupportsRead
2-
from typing import Any, overload
2+
from codecs import CodecInfo
3+
from typing import Any, Protocol, overload
34
from typing_extensions import TypeAlias
45

6+
# Is actually webencodings.Encoding
7+
class _Encoding(Protocol):
8+
name: str
9+
codec_info: CodecInfo
10+
def __init__(self, name: str, codec_info: CodecInfo) -> None: ...
11+
512
_UnicodeInputStream: TypeAlias = str | SupportsRead[str]
613
_BinaryInputStream: TypeAlias = bytes | SupportsRead[bytes]
714
_InputStream: TypeAlias = _UnicodeInputStream # noqa: Y047 # used in other files
@@ -42,13 +49,13 @@ def HTMLInputStream(
4249
class HTMLUnicodeInputStream:
4350
reportCharacterErrors: Any
4451
newLines: Any
45-
charEncoding: Any
52+
charEncoding: tuple[_Encoding, str]
4653
dataStream: Any
4754
def __init__(self, source: _UnicodeInputStream) -> None: ...
4855
chunk: str
4956
chunkSize: int
5057
chunkOffset: int
51-
errors: Any
58+
errors: list[str]
5259
prevNumLines: int
5360
prevNumCols: int
5461
def reset(self) -> None: ...
@@ -70,7 +77,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
7077
same_origin_parent_encoding: Any
7178
likely_encoding: Any
7279
default_encoding: Any
73-
charEncoding: Any
80+
charEncoding: tuple[_Encoding, str]
7481
def __init__(
7582
self,
7683
source: _BinaryInputStream,
@@ -85,7 +92,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
8592
def reset(self) -> None: ...
8693
def openStream(self, source): ...
8794
def determineEncoding(self, chardet: bool = True): ...
88-
def changeEncoding(self, newEncoding) -> None: ...
95+
def changeEncoding(self, newEncoding: str | bytes | None) -> None: ...
8996
def detectBOM(self): ...
9097
def detectEncodingMeta(self): ...
9198

Comments (0)