Skip to content
This repository was archived by the owner on Feb 13, 2025. It is now read-only.

Commit 6f2bb98

Browse files
committed
python#23144: Make sure that HTMLParser.feed() returns all the data, even when convert_charrefs is True.
1 parent 527ef07 commit 6f2bb98

File tree

3 files changed

+25
-5
lines changed

3 files changed

+25
-5
lines changed

Lib/html/parser.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,15 @@ def goahead(self, end):
198198
if self.convert_charrefs and not self.cdata_elem:
199199
j = rawdata.find('<', i)
200200
if j < 0:
201-
if not end:
201+
# if we can't find the next <, either we are at the end
202+
# or there's more text incoming. If the latter is True,
203+
# we can't pass the text to handle_data in case we have
204+
# a charref cut in half at end. Try to determine if
205+
# this is the case before proceeding by looking for an
206+
# & near the end and seeing if it's followed by a space or ;.
207+
amppos = rawdata.rfind('&', max(i, n-34))
208+
if (amppos >= 0 and
209+
not re.compile(r'[\s;]').search(rawdata, amppos)):
202210
break # wait till we get all the text
203211
j = n
204212
else:

Lib/test/test_htmlparser.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,6 @@ def handle_starttag(self, tag, attrs):
7272

7373
class EventCollectorCharrefs(EventCollector):
7474

75-
def get_events(self):
76-
return self.events
77-
7875
def handle_charref(self, data):
7976
self.fail('This should never be called with convert_charrefs=True')
8077

@@ -685,6 +682,18 @@ def test_broken_condcoms(self):
685682
]
686683
self._run_check(html, expected)
687684

685+
def test_convert_charrefs_dropped_text(self):
686+
# #23144: make sure that all the events are triggered when
687+
# convert_charrefs is True, even if we don't call .close()
688+
parser = EventCollector(convert_charrefs=True)
689+
# before the fix, bar & baz was missing
690+
parser.feed("foo <a>link</a> bar &amp; baz")
691+
self.assertEqual(
692+
parser.get_events(),
693+
[('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
694+
('endtag', 'a'), ('data', ' bar & baz')]
695+
)
696+
688697

689698
class AttributesStrictTestCase(TestCaseBase):
690699

Misc/NEWS

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
+++++++++++
1+
+++++++++++
22
Python News
33
+++++++++++
44

@@ -81,6 +81,9 @@ Core and Builtins
8181
Library
8282
-------
8383

84+
- Issue #23144: Make sure that HTMLParser.feed() returns all the data, even
85+
when convert_charrefs is True.
86+
8487
- Issue #16180: Exit pdb if file has syntax error, instead of trapping user
8588
in an infinite loop. Patch by Xavier de Gaye.
8689

0 commit comments

Comments
 (0)