From 49660f7a30c1e1d92edebea7805a526cee029582 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 12 Feb 2023 19:07:16 -0600 Subject: [PATCH] BUG: iterparse on ignores repeated elements --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/xml.py | 9 ++++-- pandas/tests/io/xml/test_xml.py | 49 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 29f360e050548..e98ac9bdc2dae 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1304,6 +1304,7 @@ I/O - Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`) - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`) +- Bug in :func:`read_xml` where repeated elements were ignored when iterparse is used (:issue:`51183`) Period ^^^^^^ diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 64fda2fdb0299..3ee4e64faf75c 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -337,6 +337,10 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: "local disk and not as compressed files or online sources." 
) + iterparse_repeats = len(self.iterparse[row_node]) != len( + set(self.iterparse[row_node]) + ) + for event, elem in iterparse(self.path_or_buffer, events=("start", "end")): curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag @@ -345,12 +349,13 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: row = {} if row is not None: - if self.names: + if self.names and iterparse_repeats: for col, nm in zip(self.iterparse[row_node], self.names): if curr_elem == col: elem_val = elem.text.strip() if elem.text else None - if row.get(nm) != elem_val and nm not in row: + if elem_val not in row.values() and nm not in row: row[nm] = elem_val + if col in elem.attrib: if elem.attrib[col] not in row.values() and nm not in row: row[nm] = elem.attrib[col] diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index dfa251788ddc3..b73116519178e 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -948,6 +948,55 @@ def test_repeat_values_new_names(parser): tm.assert_frame_equal(df_iter, df_expected) +def test_repeat_elements(parser): + xml = """\ + + + circle + ellipse + 360 + 0 + + + triangle + polygon + 180 + 3 + + + square + polygon + 360 + 4 + +""" + df_xpath = read_xml( + xml, + xpath=".//shape", + parser=parser, + names=["name", "family", "degrees", "sides"], + ) + + df_iter = read_xml_iterparse( + xml, + parser=parser, + iterparse={"shape": ["value", "value", "value", "value"]}, + names=["name", "family", "degrees", "sides"], + ) + + df_expected = DataFrame( + { + "name": ["circle", "triangle", "square"], + "family": ["ellipse", "polygon", "polygon"], + "degrees": [360, 180, 360], + "sides": [0, 3, 4], + } + ) + + tm.assert_frame_equal(df_xpath, df_expected) + tm.assert_frame_equal(df_iter, df_expected) + + def test_names_option_wrong_length(datapath, parser): filename = datapath("io", "data", "xml", "books.xml")