Skip to content

Commit f3c00f8

Browse files
committed
bpo-37399: Correctly attach tail text to the last element/comment/PI, even when comments or PIs are discarded.
Also fixes the insertion of PIs when "insert_pis=True" is configured for a TreeBuilder.
1 parent d395209 commit f3c00f8

File tree

2 files changed

+113
-16
lines changed

2 files changed

+113
-16
lines changed

Lib/test/test_xml_etree.py

+60
Original file line numberDiff line numberDiff line change
@@ -2954,6 +2954,66 @@ def test_treebuilder_pi(self):
29542954
self.assertEqual(b.pi('target'), (len('target'), None))
29552955
self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text '))
29562956

2957+
def test_late_tail(self):
2958+
# Issue #37399: The tail of an ignored comment could overwrite the text before it.
2959+
class TreeBuilderSubclass(ET.TreeBuilder):
2960+
pass
2961+
2962+
xml = "<a>text<!-- comment -->tail</a>"
2963+
a = ET.fromstring(xml)
2964+
self.assertEqual(a.text, "texttail")
2965+
2966+
parser = ET.XMLParser(target=TreeBuilderSubclass())
2967+
parser.feed(xml)
2968+
a = parser.close()
2969+
self.assertEqual(a.text, "texttail")
2970+
2971+
xml = "<a>text<?pi data?>tail</a>"
2972+
a = ET.fromstring(xml)
2973+
self.assertEqual(a.text, "texttail")
2974+
2975+
xml = "<a>text<?pi data?>tail</a>"
2976+
parser = ET.XMLParser(target=TreeBuilderSubclass())
2977+
parser.feed(xml)
2978+
a = parser.close()
2979+
self.assertEqual(a.text, "texttail")
2980+
2981+
def test_late_tail_mix_pi_comments(self):
2982+
# Issue #37399: The tail of an ignored comment could overwrite the text before it.
2983+
# Test appending tails to comments/pis.
2984+
class TreeBuilderSubclass(ET.TreeBuilder):
2985+
pass
2986+
2987+
xml = "<a>text<?pi1?><!-- comment --><?pi2?>tail</a>"
2988+
parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
2989+
parser.feed(xml)
2990+
a = parser.close()
2991+
self.assertEqual(a[0].text, ' comment ')
2992+
self.assertEqual(a[0].tail, 'tail')
2993+
self.assertEqual(a.text, "text")
2994+
2995+
parser = ET.XMLParser(target=TreeBuilderSubclass(insert_comments=True))
2996+
parser.feed(xml)
2997+
a = parser.close()
2998+
self.assertEqual(a[0].text, ' comment ')
2999+
self.assertEqual(a[0].tail, 'tail')
3000+
self.assertEqual(a.text, "text")
3001+
3002+
xml = "<a>text<!-- comment --><?pi data?>tail</a>"
3003+
parser = ET.XMLParser(target=ET.TreeBuilder(insert_pis=True))
3004+
parser.feed(xml)
3005+
a = parser.close()
3006+
self.assertEqual(a[0].text, 'pi data')
3007+
self.assertEqual(a[0].tail, 'tail')
3008+
self.assertEqual(a.text, "text")
3009+
3010+
parser = ET.XMLParser(target=TreeBuilderSubclass(insert_pis=True))
3011+
parser.feed(xml)
3012+
a = parser.close()
3013+
self.assertEqual(a[0].text, 'pi data')
3014+
self.assertEqual(a[0].tail, 'tail')
3015+
self.assertEqual(a.text, "text")
3016+
29573017
def test_treebuilder_elementfactory_none(self):
29583018
parser = ET.XMLParser(target=ET.TreeBuilder(element_factory=None))
29593019
parser.feed(self.sample1)

Modules/_elementtree.c

+53-16
Original file line numberDiff line numberDiff line change
@@ -2399,6 +2399,7 @@ typedef struct {
23992399

24002400
PyObject *this; /* current node */
24012401
PyObject *last; /* most recently created node */
2402+
PyObject *last_for_tail; /* most recently created node that takes a tail */
24022403

24032404
PyObject *data; /* data collector (string or list), or NULL */
24042405

@@ -2530,6 +2531,7 @@ treebuilder_gc_traverse(TreeBuilderObject *self, visitproc visit, void *arg)
25302531
Py_VISIT(self->root);
25312532
Py_VISIT(self->this);
25322533
Py_VISIT(self->last);
2534+
Py_VISIT(self->last_for_tail);
25332535
Py_VISIT(self->data);
25342536
Py_VISIT(self->stack);
25352537
Py_VISIT(self->pi_factory);
@@ -2551,6 +2553,7 @@ treebuilder_gc_clear(TreeBuilderObject *self)
25512553
Py_CLEAR(self->stack);
25522554
Py_CLEAR(self->data);
25532555
Py_CLEAR(self->last);
2556+
Py_CLEAR(self->last_for_tail);
25542557
Py_CLEAR(self->this);
25552558
Py_CLEAR(self->pi_factory);
25562559
Py_CLEAR(self->comment_factory);
@@ -2622,21 +2625,48 @@ _elementtree__set_factories_impl(PyObject *module, PyObject *comment_factory,
26222625
}
26232626

26242627
static int
2625-
treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
2626-
PyObject **dest, _Py_Identifier *name)
2628+
treebuilder_extend_element_text_or_tail(PyObject *element, PyObject **data,
2629+
PyObject **dest, _Py_Identifier *name)
26272630
{
2631+
/* Fast paths for the "almost always" cases. */
26282632
if (Element_CheckExact(element)) {
2629-
PyObject *tmp = JOIN_OBJ(*dest);
2630-
*dest = JOIN_SET(*data, PyList_CheckExact(*data));
2631-
*data = NULL;
2632-
Py_DECREF(tmp);
2633-
return 0;
2633+
PyObject *dest_obj = JOIN_OBJ(*dest);
2634+
if (dest_obj == Py_None) {
2635+
*dest = JOIN_SET(*data, PyList_CheckExact(*data));
2636+
*data = NULL;
2637+
Py_DECREF(dest_obj);
2638+
return 0;
2639+
}
2640+
else if (JOIN_GET(*dest)) {
2641+
if (PyList_SetSlice(dest_obj, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, *data) < 0) {
2642+
return -1;
2643+
}
2644+
Py_CLEAR(*data);
2645+
return 0;
2646+
}
26342647
}
2635-
else {
2636-
PyObject *joined = list_join(*data);
2648+
2649+
/* Fallback for the non-Element / non-trivial cases. */
2650+
{
26372651
int r;
2638-
if (joined == NULL)
2652+
PyObject *joined, *previous = _PyObject_GetAttrId(element, name);
2653+
if (!previous)
26392654
return -1;
2655+
joined = list_join(*data);
2656+
if (!joined) {
2657+
Py_DECREF(previous);
2658+
return -1;
2659+
}
2660+
if (previous != Py_None) {
2661+
PyObject *tmp = PyNumber_Add(previous, joined);
2662+
Py_DECREF(joined);
2663+
if (!tmp) {
2664+
Py_DECREF(previous);
2665+
return -1;
2666+
}
2667+
joined = tmp;
2668+
}
2669+
26402670
r = _PyObject_SetAttrId(element, name, joined);
26412671
Py_DECREF(joined);
26422672
if (r < 0)
@@ -2649,21 +2679,21 @@ treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
26492679
LOCAL(int)
26502680
treebuilder_flush_data(TreeBuilderObject* self)
26512681
{
2652-
PyObject *element = self->last;
2653-
26542682
if (!self->data) {
26552683
return 0;
26562684
}
26572685

2658-
if (self->this == element) {
2686+
if (!self->last_for_tail) {
2687+
PyObject *element = self->last;
26592688
_Py_IDENTIFIER(text);
2660-
return treebuilder_set_element_text_or_tail(
2689+
return treebuilder_extend_element_text_or_tail(
26612690
element, &self->data,
26622691
&((ElementObject *) element)->text, &PyId_text);
26632692
}
26642693
else {
2694+
PyObject *element = self->last_for_tail;
26652695
_Py_IDENTIFIER(tail);
2666-
return treebuilder_set_element_text_or_tail(
2696+
return treebuilder_extend_element_text_or_tail(
26672697
element, &self->data,
26682698
&((ElementObject *) element)->tail, &PyId_tail);
26692699
}
@@ -2739,6 +2769,7 @@ treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag,
27392769
}
27402770

27412771
this = self->this;
2772+
Py_CLEAR(self->last_for_tail);
27422773

27432774
if (this != Py_None) {
27442775
if (treebuilder_add_subelement(this, node) < 0)
@@ -2836,6 +2867,8 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag)
28362867

28372868
item = self->last;
28382869
self->last = self->this;
2870+
Py_INCREF(self->last);
2871+
Py_XSETREF(self->last_for_tail, self->last);
28392872
self->index--;
28402873
self->this = PyList_GET_ITEM(self->stack, self->index);
28412874
Py_INCREF(self->this);
@@ -2867,6 +2900,8 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
28672900
if (self->insert_comments && this != Py_None) {
28682901
if (treebuilder_add_subelement(this, comment) < 0)
28692902
goto error;
2903+
Py_INCREF(comment);
2904+
Py_XSETREF(self->last_for_tail, comment);
28702905
}
28712906
} else {
28722907
Py_INCREF(text);
@@ -2906,6 +2941,8 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
29062941
if (self->insert_pis && this != Py_None) {
29072942
if (treebuilder_add_subelement(this, pi) < 0)
29082943
goto error;
2944+
Py_INCREF(pi);
2945+
Py_XSETREF(self->last_for_tail, pi);
29092946
}
29102947
} else {
29112948
pi = PyTuple_Pack(2, target, text);
@@ -3599,7 +3636,7 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
35993636
/* shortcut */
36003637
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
36013638

3602-
if (target->events_append && target->pi_event_obj) {
3639+
if (target->events_append && target->pi_event_obj || target->insert_pis) {
36033640
pi_target = PyUnicode_DecodeUTF8(target_in, strlen(target_in), "strict");
36043641
if (!pi_target)
36053642
goto error;

0 commit comments

Comments
 (0)