4
4
5
5
import re
6
6
from html .entities import name2codepoint
7
- from typing import Match , Sequence , AnyStr
7
+ from typing import Iterable , Match , AnyStr , Optional , Pattern , Tuple , Union
8
8
from urllib .parse import urljoin
9
9
10
10
from w3lib .util import to_unicode
19
19
HTML5_WHITESPACE = ' \t \n \r \x0c '
20
20
21
21
22
- def replace_entities (text : AnyStr , keep : Sequence [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ):
22
+ def replace_entities (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ) -> str :
23
23
"""Remove entities from the given `text` by converting them to their
24
24
corresponding unicode character.
25
25
@@ -47,7 +47,7 @@ def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: boo
47
47
48
48
"""
49
49
50
- def convert_entity (m : Match ):
50
+ def convert_entity (m : Match ) -> str :
51
51
groups = m .groupdict ()
52
52
number = None
53
53
if groups .get ('dec' ):
@@ -80,10 +80,10 @@ def convert_entity(m: Match):
80
80
81
81
return _ent_re .sub (convert_entity , to_unicode (text , encoding ))
82
82
83
def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
    """Return ``True`` if the given text contains any HTML/XML entity."""
    decoded = to_unicode(text, encoding)
    return _ent_re.search(decoded) is not None
85
85
86
- def replace_tags (text , token = '' , encoding = None ):
86
+ def replace_tags (text : AnyStr , token : str = '' , encoding : Optional [ str ] = None ) -> str :
87
87
"""Replace all markup tags found in the given `text` by the given token.
88
88
By default `token` is an empty string so it just removes all tags.
89
89
@@ -107,7 +107,7 @@ def replace_tags(text, token='', encoding=None):
107
107
108
108
109
109
# Matches an HTML comment; the (?:-->|$) alternative also swallows an
# unterminated comment running to the end of the input.
_REMOVECOMMENTS_RE = re .compile ('<!--.*?(?:-->|$)' , re .DOTALL )
110
def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
    """Remove HTML comments (``<!-- ... -->``) from the given text.

    An unterminated comment that runs to the end of the input is
    removed as well.
    """
    as_text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub('', as_text)
122
122
123
- def remove_tags (text , which_ones = (), keep = (), encoding = None ):
123
+ def remove_tags (text : AnyStr , which_ones : Iterable [ str ] = (), keep : Iterable [ str ] = (), encoding : Optional [ str ] = None ) -> str :
124
124
""" Remove HTML Tags only.
125
125
126
126
`which_ones` and `keep` are both tuples, there are four cases:
@@ -170,14 +170,14 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
170
170
which_ones = {tag .lower () for tag in which_ones }
171
171
keep = {tag .lower () for tag in keep }
172
172
173
- def will_remove (tag ) :
173
+ def will_remove (tag : str ) -> bool :
174
174
tag = tag .lower ()
175
175
if which_ones :
176
176
return tag in which_ones
177
177
else :
178
178
return tag not in keep
179
179
180
- def remove_tag (m ) :
180
+ def remove_tag (m : Match ) -> str :
181
181
tag = m .group (1 )
182
182
return '' if will_remove (tag ) else m .group (0 )
183
183
@@ -186,7 +186,7 @@ def remove_tag(m):
186
186
187
187
return retags .sub (remove_tag , to_unicode (text , encoding ))
188
188
189
def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str:
    """Remove tags and their content.

    `which_ones` is a tuple of the tags to remove including everything
    they contain: for each listed tag, the opening tag, its content and
    the closing tag (or a self-closing tag) are dropped from the result.
    """
    result = to_unicode(text, encoding)
    if which_ones:
        pattern = '|'.join(
            r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones
        )
        tags_re = re.compile(pattern, re.DOTALL | re.IGNORECASE)
        result = tags_re.sub('', result)
    return result
209
209
210
210
211
def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: str = '',
                         encoding: Optional[str] = None) -> str:
    """Remove escape characters.

    `which_ones` is a tuple of the escape characters to replace
    (by default: ``\\n``, ``\\t`` and ``\\r``).

    `replace_by` is the string used in place of each of them (an empty
    string by default, which simply removes them).
    """
    utext = to_unicode(text, encoding)
    # The replacement string is the same for every escape character, so
    # decode it once here instead of once per loop iteration.
    replacement = to_unicode(replace_by, encoding)
    for ec in which_ones:
        utext = utext.replace(ec, replacement)
    return utext
227
227
228
- def unquote_markup (text , keep = (), remove_illegal = True , encoding = None ):
228
+ def unquote_markup (text : AnyStr , keep : Iterable [ str ] = (), remove_illegal : bool = True , encoding : Optional [ str ] = None ) -> str :
229
229
"""
230
230
This function receives markup as a text (always a unicode string or
231
231
a UTF-8 encoded string) and does the following:
@@ -237,7 +237,7 @@ def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
237
237
238
238
"""
239
239
240
- def _get_fragments (txt , pattern ) :
240
+ def _get_fragments (txt : str , pattern : Pattern ) -> Iterable [ Union [ str , Match ]] :
241
241
offset = 0
242
242
for match in pattern .finditer (txt ):
243
243
match_s , match_e = match .span (1 )
@@ -246,9 +246,9 @@ def _get_fragments(txt, pattern):
246
246
offset = match_e
247
247
yield txt [offset :]
248
248
249
- text = to_unicode (text , encoding )
249
+ utext = to_unicode (text , encoding )
250
250
ret_text = ''
251
- for fragment in _get_fragments (text , _cdata_re ):
251
+ for fragment in _get_fragments (utext , _cdata_re ):
252
252
if isinstance (fragment , str ):
253
253
# it's not a CDATA (so we try to remove its entities)
254
254
ret_text += replace_entities (fragment , keep = keep , remove_illegal = remove_illegal )
@@ -257,16 +257,16 @@ def _get_fragments(txt, pattern):
257
257
ret_text += fragment .group ('cdata_d' )
258
258
return ret_text
259
259
260
- def get_base_url (text , baseurl = '' , encoding = 'utf-8' ):
260
+ def get_base_url (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' ) -> str :
261
261
"""Return the base url if declared in the given HTML `text`,
262
262
relative to the given base url.
263
263
264
264
If no base url is found, the given `baseurl` is returned.
265
265
266
266
"""
267
267
268
- text = to_unicode (text , encoding )
269
- m = _baseurl_re .search (text )
268
+ utext = to_unicode (text , encoding )
269
+ m = _baseurl_re .search (utext )
270
270
if m :
271
271
return urljoin (
272
272
safe_url_string (baseurl ),
@@ -275,7 +275,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
275
275
else :
276
276
return safe_url_string (baseurl )
277
277
278
- def get_meta_refresh (text , baseurl = '' , encoding = 'utf-8' , ignore_tags = ('script' , 'noscript' )):
278
+ def get_meta_refresh (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' , ignore_tags : Iterable [ str ] = ('script' , 'noscript' )) -> Tuple [ Optional [ float ], Optional [ str ]] :
279
279
"""Return the http-equiv parameter of the HTML meta element from the given
280
280
HTML text and return a tuple ``(interval, url)`` where interval is an integer
281
281
containing the delay in seconds (or zero if not present) and url is a
@@ -286,13 +286,13 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
286
286
"""
287
287
288
288
try :
289
- text = to_unicode (text , encoding )
289
+ utext = to_unicode (text , encoding )
290
290
except UnicodeDecodeError :
291
291
print (text )
292
292
raise
293
- text = remove_tags_with_content (text , ignore_tags )
294
- text = remove_comments (replace_entities (text ))
295
- m = _meta_refresh_re .search (text )
293
+ utext = remove_tags_with_content (utext , ignore_tags )
294
+ utext = remove_comments (replace_entities (utext ))
295
+ m = _meta_refresh_re .search (utext )
296
296
if m :
297
297
interval = float (m .group ('int' ))
298
298
url = safe_url_string (m .group ('url' ).strip (' "\' ' ), encoding )
@@ -302,7 +302,7 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
302
302
return None , None
303
303
304
304
305
- def strip_html5_whitespace (text ) :
305
+ def strip_html5_whitespace (text : str ) -> str :
306
306
r"""
307
307
Strip all leading and trailing space characters (as defined in
308
308
https://www.w3.org/TR/html5/infrastructure.html#space-character).
0 commit comments