Patch summary: when `names` is supplied together with `iterparse`, store each
parsed element/attribute value under the caller's column name so that repeated
element and attribute names can be told apart; adds test_repeat_names.

NOTE(review): this patch text was recovered from a whitespace-collapsed copy.
Line breaks follow the hunk line counts; context-line indentation and the XML
markup inside test_repeat_names were reconstructed (from Python nesting and
from the test's iterparse mapping / expected DataFrame) -- confirm against the
upstream commit before applying.

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 181b0fe115f4c..78fbeaad09300 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -413,11 +413,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
@@ -661,11 +671,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
@@ -1020,7 +1040,8 @@ def read_xml(
 
     names : list-like, optional
         Column names for DataFrame of parsed XML data. Use this parameter to
-        rename original element names and distinguish same named elements.
+        rename original element names and distinguish same named elements and
+        attributes.
 
     dtype : Type name or dict of column -> type, optional
         Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 277b6442a0a8c..eb2230bbf7fd5 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -789,6 +789,41 @@ def test_names_option_output(datapath, parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
+def test_repeat_names(parser):
+    xml = """\
+<shapes>
+  <shape type="2D">
+    <name>circle</name>
+    <type>curved</type>
+  </shape>
+  <shape type="3D">
+    <name>sphere</name>
+    <type>curved</type>
+  </shape>
+</shapes>"""
+    df_xpath = read_xml(
+        xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
+    )
+
+    df_iter = read_xml_iterparse(
+        xml,
+        parser=parser,
+        iterparse={"shape": ["type", "name", "type"]},
+        names=["type_dim", "shape", "type_edge"],
+    )
+
+    df_expected = DataFrame(
+        {
+            "type_dim": ["2D", "3D"],
+            "shape": ["circle", "sphere"],
+            "type_edge": ["curved", "curved"],
+        }
+    )
+
+    tm.assert_frame_equal(df_xpath, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
 def test_names_option_wrong_length(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
 