From 4aa32cc3ff382843d0d3a5c56402c70e20b3b63f Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Fri, 17 Jun 2022 23:16:04 -0500
Subject: [PATCH 1/5] BUG: iterparse of read_xml not parsing duplicate element
 and attribute names

---
 doc/source/whatsnew/v1.5.0.rst  |  1 +
 pandas/io/xml.py                | 40 ++++++++++++++++++++++++---------
 pandas/tests/io/xml/test_xml.py | 35 +++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 76f6e864a174f..75a51cfbc506a 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -874,6 +874,7 @@ I/O
 - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
 - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`)
 - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`)
+- Bug in :func:`read_xml` where ``iterparse`` did not parse duplicate element and attribute names (:issue:`47343`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 181b0fe115f4c..2c6b20058122b 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -413,11 +413,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
@@ -661,11 +671,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 277b6442a0a8c..eb2230bbf7fd5 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -789,6 +789,41 @@ def test_names_option_output(datapath, parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
+def test_repeat_names(parser):
+    xml = """\
+<shapes>
+  <shape type="2D">
+    <name>circle</name>
+    <type>curved</type>
+  </shape>
+  <shape type="3D">
+    <name>sphere</name>
+    <type>curved</type>
+  </shape>
+</shapes>"""
+    df_xpath = read_xml(
+        xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
+    )
+
+    df_iter = read_xml_iterparse(
+        xml,
+        parser=parser,
+        iterparse={"shape": ["type", "name", "type"]},
+        names=["type_dim", "shape", "type_edge"],
+    )
+
+    df_expected = DataFrame(
+        {
+            "type_dim": ["2D", "3D"],
+            "shape": ["circle", "sphere"],
+            "type_edge": ["curved", "curved"],
+        }
+    )
+
+    tm.assert_frame_equal(df_xpath, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
 def test_names_option_wrong_length(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
 

From 65c09d57d664ab0e9d8e0570179121db9c7a0e1f Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Sat, 18 Jun 2022 16:49:52 -0500
Subject: [PATCH 2/5] Refactor duplicative code in each parser to shared base
 class

---
 pandas/io/xml.py | 366 ++++++++++++++---------------------------------
 1 file changed, 107 insertions(+), 259 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 2c6b20058122b..d97e840524116 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -177,7 +177,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         raise AbstractMethodError(self)
 
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
+    def _parse_nodes(self, elems) -> list[dict[str, str | None]]:
         """
         Parse xml nodes.
 
@@ -197,102 +197,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
         will have optional keys filled with None values.
         """
 
-        raise AbstractMethodError(self)
-
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        """
-        Iterparse xml nodes.
-
-        This method will read in local disk, decompressed XML files for elements
-        and underlying descendants using iterparse, a method to iterate through
-        an XML tree without holding entire XML tree in memory.
-
-        Raises
-        ------
-        TypeError
-            * If `iterparse` is not a dict or its dict value is not list-like.
-        ParserError
-            * If `path_or_buffer` is not a physical, decompressed file on disk.
-            * If no data is returned from selected items in `iterparse`.
-
-        Notes
-        -----
-        Namespace URIs will be removed from return node values. Also,
-        elements with missing children or attributes in submitted list
-        will have optional keys filled with None values.
-        """
-
-        raise AbstractMethodError(self)
-
-    def _validate_path(self) -> None:
-        """
-        Validate xpath.
-
-        This method checks for syntax, evaluation, or empty nodes return.
-
-        Raises
-        ------
-        SyntaxError
-            * If xpah is not supported or issues with namespaces.
-
-        ValueError
-            * If xpah does not return any nodes.
-        """
-
-        raise AbstractMethodError(self)
-
-    def _validate_names(self) -> None:
-        """
-        Validate names.
-
-        This method will check if names is a list-like and aligns
-        with length of parse nodes.
-
-        Raises
-        ------
-        ValueError
-            * If value is not a list and less then length of nodes.
-        """
-        raise AbstractMethodError(self)
-
-    def _parse_doc(self, raw_doc) -> bytes:
-        """
-        Build tree from path_or_buffer.
-
-        This method will parse XML object into tree
-        either from string/bytes or file location.
-        """
-        raise AbstractMethodError(self)
-
-
-class _EtreeFrameParser(_XMLFrameParser):
-    """
-    Internal class to parse XML into DataFrames with the Python
-    standard library XML module: `xml.etree.ElementTree`.
-    """
-
-    def parse_data(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import XML
-
-        if self.stylesheet is not None:
-            raise ValueError(
-                "To use stylesheet, you need lxml installed and selected as parser."
-            )
-
-        if self.iterparse is None:
-            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
-            self._validate_path()
-
-        self._validate_names()
-
-        xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
-        )
-
-        return xml_dicts
-
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
-        elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
         dicts: list[dict[str, str | None]]
 
         if self.elems_only and self.attrs_only:
@@ -375,8 +279,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import iterparse
+    def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]:
+        """
+        Iterparse xml nodes.
+
+        This method will read in local disk, decompressed XML files for elements
+        and underlying descendants using iterparse, a method to iterate through
+        an XML tree without holding entire XML tree in memory.
+
+        Raises
+        ------
+        TypeError
+            * If `iterparse` is not a dict or its dict value is not list-like.
+        ParserError
+            * If `path_or_buffer` is not a physical, decompressed file on disk.
+            * If no data is returned from selected items in `iterparse`.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes in submitted list
+        will have optional keys filled with None values.
+        """
 
         dicts: list[dict[str, str | None]] = []
         row: dict[str, str | None] | None = None
@@ -433,6 +357,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                 if curr_elem == row_node and row is not None:
                     dicts.append(row)
                     row = None
+
                 elem.clear()
 
         if dicts == []:
@@ -446,6 +371,79 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
+    def _validate_path(self) -> None:
+        """
+        Validate xpath.
+
+        This method checks for syntax, evaluation, or empty nodes return.
+
+        Raises
+        ------
+        SyntaxError
+            * If xpah is not supported or issues with namespaces.
+
+        ValueError
+            * If xpah does not return any nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_names(self) -> None:
+        """
+        Validate names.
+
+        This method will check if names is a list-like and aligns
+        with length of parse nodes.
+
+        Raises
+        ------
+        ValueError
+            * If value is not a list and less then length of nodes.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_doc(self, raw_doc) -> bytes:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
+        raise AbstractMethodError(self)
+
+
+class _EtreeFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into DataFrames with the Python
+    standard library XML module: `xml.etree.ElementTree`.
+    """
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import (
+            XML,
+            iterparse,
+        )
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+            )
+
+        if self.iterparse is None:
+            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+            self._validate_path()
+            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+
+        self._validate_names()
+
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
+        )
+
+        return xml_dicts
+
     def _validate_path(self) -> None:
         """
         Notes
@@ -531,7 +529,10 @@ def parse_data(self) -> list[dict[str, str | None]]:
         validate xpath, names, optionally parse and run XSLT,
         and parse original or transformed XML and return specific nodes.
         """
-        from lxml.etree import XML
+        from lxml.etree import (
+            XML,
+            iterparse,
+        )
 
         if self.iterparse is None:
             self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
@@ -541,172 +542,18 @@ def parse_data(self) -> list[dict[str, str | None]]:
                 self.xml_doc = XML(self._transform_doc())
 
             self._validate_path()
+            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
 
         self._validate_names()
 
         xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
         )
 
         return xml_dicts
 
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
-        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
-        dicts: list[dict[str, str | None]]
-
-        if self.elems_only and self.attrs_only:
-            raise ValueError("Either element or attributes can be parsed not both.")
-
-        elif self.elems_only:
-            if self.names:
-                dicts = [
-                    {
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            nm: ch.text.strip() if ch.text else None
-                            for nm, ch in zip(self.names, el.xpath("*"))
-                        },
-                    }
-                    for el in elems
-                ]
-            else:
-                dicts = [
-                    {
-                        ch.tag: ch.text.strip() if ch.text else None
-                        for ch in el.xpath("*")
-                    }
-                    for el in elems
-                ]
-
-        elif self.attrs_only:
-            dicts = [el.attrib for el in elems]
-
-        else:
-            if self.names:
-                dicts = [
-                    {
-                        **el.attrib,
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            nm: ch.text.strip() if ch.text else None
-                            for nm, ch in zip(self.names, el.xpath("*"))
-                        },
-                    }
-                    for el in elems
-                ]
-            else:
-                dicts = [
-                    {
-                        **el.attrib,
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            ch.tag: ch.text.strip() if ch.text else None
-                            for ch in el.xpath("*")
-                        },
-                    }
-                    for el in elems
-                ]
-
-        if self.namespaces or "}" in list(dicts[0].keys())[0]:
-            dicts = [
-                {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
-                for d in dicts
-            ]
-
-        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
-        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
-        if self.names:
-            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
-
-        return dicts
-
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        from lxml.etree import iterparse
-
-        dicts: list[dict[str, str | None]] = []
-        row: dict[str, str | None] | None = None
-
-        if not isinstance(self.iterparse, dict):
-            raise TypeError(
-                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
-            )
-
-        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
-        if not is_list_like(self.iterparse[row_node]):
-            raise TypeError(
-                f"{type(self.iterparse[row_node])} is not a valid type "
-                "for value in iterparse"
-            )
-
-        if (
-            not isinstance(self.path_or_buffer, str)
-            or is_url(self.path_or_buffer)
-            or is_fsspec_url(self.path_or_buffer)
-            or self.path_or_buffer.startswith(("<?xml", "<"))
-            or infer_compression(self.path_or_buffer, "infer") is not None
-        ):
-            raise ParserError(
-                "iterparse is designed for large XML files that are fully extracted on "
-                "local disk and not as compressed files or online sources."
-            )
-
-        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
-            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
-
-            if event == "start":
-                if curr_elem == row_node:
-                    row = {}
-
-            if row is not None:
-                if self.names:
-                    for col, nm in zip(self.iterparse[row_node], self.names):
-                        if curr_elem == col:
-                            elem_val = elem.text.strip() if elem.text else None
-                            if elem_val not in row.values() and nm not in row:
-                                row[nm] = elem_val
-                        if col in elem.attrib:
-                            if elem.attrib[col] not in row.values() and nm not in row:
-                                row[nm] = elem.attrib[col]
-                else:
-                    for col in self.iterparse[row_node]:
-                        if curr_elem == col:
-                            row[col] = elem.text.strip() if elem.text else None
-                        if col in elem.attrib:
-                            row[col] = elem.attrib[col]
-
-            if event == "end":
-                if curr_elem == row_node and row is not None:
-                    dicts.append(row)
-                    row = None
-
-                elem.clear()
-                while elem.getprevious() is not None:
-                    del elem.getparent()[0]
-
-        if dicts == []:
-            raise ParserError("No result from selected items in iterparse.")
-
-        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
-        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
-        if self.names:
-            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
-
-        return dicts
-
     def _validate_path(self) -> None:
 
         msg = (
@@ -1040,7 +887,8 @@ def read_xml(
 
     names :  list-like, optional
         Column names for DataFrame of parsed XML data. Use this parameter to
-        rename original element names and distinguish same named elements.
+        rename original element names and distinguish same named elements and
+        attributes.
 
     dtype : Type name or dict of column -> type, optional
         Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,

From 3c241d5c823cf94c68c3f043ac3edfc08746aee5 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Sun, 19 Jun 2022 11:27:07 -0500
Subject: [PATCH 3/5] Add lxml preceding-sibling iterparse cleanup

---
 pandas/io/xml.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index d97e840524116..570843987e022 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -359,6 +359,9 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]:
                     row = None
 
                 elem.clear()
+                if hasattr(elem, "getprevious"):
+                    while elem.getprevious() is not None:
+                        del elem.getparent()[0]
 
         if dicts == []:
             raise ParserError("No result from selected items in iterparse.")

From d73b32f80407964016a6e4cc9c4c5d4fc972c1d7 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Sun, 19 Jun 2022 20:38:10 -0500
Subject: [PATCH 4/5] Revert code refactoring back to bug fix only

---
 pandas/io/xml.py | 366 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 258 insertions(+), 108 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 570843987e022..78fbeaad09300 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -177,7 +177,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         raise AbstractMethodError(self)
 
-    def _parse_nodes(self, elems) -> list[dict[str, str | None]]:
+    def _parse_nodes(self) -> list[dict[str, str | None]]:
         """
         Parse xml nodes.
 
@@ -197,6 +197,102 @@ def _parse_nodes(self, elems) -> list[dict[str, str | None]]:
         will have optional keys filled with None values.
         """
 
+        raise AbstractMethodError(self)
+
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
+        """
+        Iterparse xml nodes.
+
+        This method will read in local disk, decompressed XML files for elements
+        and underlying descendants using iterparse, a method to iterate through
+        an XML tree without holding entire XML tree in memory.
+
+        Raises
+        ------
+        TypeError
+            * If `iterparse` is not a dict or its dict value is not list-like.
+        ParserError
+            * If `path_or_buffer` is not a physical, decompressed file on disk.
+            * If no data is returned from selected items in `iterparse`.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes in submitted list
+        will have optional keys filled with None values.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_path(self) -> None:
+        """
+        Validate xpath.
+
+        This method checks for syntax, evaluation, or empty nodes return.
+
+        Raises
+        ------
+        SyntaxError
+            * If xpah is not supported or issues with namespaces.
+
+        ValueError
+            * If xpah does not return any nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_names(self) -> None:
+        """
+        Validate names.
+
+        This method will check if names is a list-like and aligns
+        with length of parse nodes.
+
+        Raises
+        ------
+        ValueError
+            * If value is not a list and less then length of nodes.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_doc(self, raw_doc) -> bytes:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
+        raise AbstractMethodError(self)
+
+
+class _EtreeFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into DataFrames with the Python
+    standard library XML module: `xml.etree.ElementTree`.
+    """
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import XML
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+            )
+
+        if self.iterparse is None:
+            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+            self._validate_path()
+
+        self._validate_names()
+
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
+        )
+
+        return xml_dicts
+
+    def _parse_nodes(self) -> list[dict[str, str | None]]:
+        elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
         dicts: list[dict[str, str | None]]
 
         if self.elems_only and self.attrs_only:
@@ -279,28 +375,8 @@ def _parse_nodes(self, elems) -> list[dict[str, str | None]]:
 
         return dicts
 
-    def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]:
-        """
-        Iterparse xml nodes.
-
-        This method will read in local disk, decompressed XML files for elements
-        and underlying descendants using iterparse, a method to iterate through
-        an XML tree without holding entire XML tree in memory.
-
-        Raises
-        ------
-        TypeError
-            * If `iterparse` is not a dict or its dict value is not list-like.
-        ParserError
-            * If `path_or_buffer` is not a physical, decompressed file on disk.
-            * If no data is returned from selected items in `iterparse`.
-
-        Notes
-        -----
-        Namespace URIs will be removed from return node values. Also,
-        elements with missing children or attributes in submitted list
-        will have optional keys filled with None values.
-        """
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import iterparse
 
         dicts: list[dict[str, str | None]] = []
         row: dict[str, str | None] | None = None
@@ -357,11 +433,7 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]:
                 if curr_elem == row_node and row is not None:
                     dicts.append(row)
                     row = None
-
                 elem.clear()
-                if hasattr(elem, "getprevious"):
-                    while elem.getprevious() is not None:
-                        del elem.getparent()[0]
 
         if dicts == []:
             raise ParserError("No result from selected items in iterparse.")
@@ -374,79 +446,6 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]:
 
         return dicts
 
-    def _validate_path(self) -> None:
-        """
-        Validate xpath.
-
-        This method checks for syntax, evaluation, or empty nodes return.
-
-        Raises
-        ------
-        SyntaxError
-            * If xpah is not supported or issues with namespaces.
-
-        ValueError
-            * If xpah does not return any nodes.
-        """
-
-        raise AbstractMethodError(self)
-
-    def _validate_names(self) -> None:
-        """
-        Validate names.
-
-        This method will check if names is a list-like and aligns
-        with length of parse nodes.
-
-        Raises
-        ------
-        ValueError
-            * If value is not a list and less then length of nodes.
-        """
-        raise AbstractMethodError(self)
-
-    def _parse_doc(self, raw_doc) -> bytes:
-        """
-        Build tree from path_or_buffer.
-
-        This method will parse XML object into tree
-        either from string/bytes or file location.
-        """
-        raise AbstractMethodError(self)
-
-
-class _EtreeFrameParser(_XMLFrameParser):
-    """
-    Internal class to parse XML into DataFrames with the Python
-    standard library XML module: `xml.etree.ElementTree`.
-    """
-
-    def parse_data(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import (
-            XML,
-            iterparse,
-        )
-
-        if self.stylesheet is not None:
-            raise ValueError(
-                "To use stylesheet, you need lxml installed and selected as parser."
-            )
-
-        if self.iterparse is None:
-            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
-            self._validate_path()
-            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
-
-        self._validate_names()
-
-        xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes(elems)
-            if self.iterparse is None
-            else self._iterparse_nodes(iterparse)
-        )
-
-        return xml_dicts
-
     def _validate_path(self) -> None:
         """
         Notes
@@ -532,10 +531,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
         validate xpath, names, optionally parse and run XSLT,
         and parse original or transformed XML and return specific nodes.
         """
-        from lxml.etree import (
-            XML,
-            iterparse,
-        )
+        from lxml.etree import XML
 
         if self.iterparse is None:
             self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
@@ -545,18 +541,172 @@ def parse_data(self) -> list[dict[str, str | None]]:
                 self.xml_doc = XML(self._transform_doc())
 
             self._validate_path()
-            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
 
         self._validate_names()
 
         xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes(elems)
-            if self.iterparse is None
-            else self._iterparse_nodes(iterparse)
+            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
         )
 
         return xml_dicts
 
+    def _parse_nodes(self) -> list[dict[str, str | None]]:  # one dict per xpath match
+        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
+        dicts: list[dict[str, str | None]]
+        # Pick what each row holds: child elements only, attributes only, or both.
+        if self.elems_only and self.attrs_only:
+            raise ValueError("Either element or attributes can be parsed not both.")
+
+        elif self.elems_only:
+            if self.names:  # label child elements positionally with self.names
+                dicts = [
+                    {
+                        **(  # matched node's own text, kept only when non-blank
+                            {el.tag: el.text.strip()}
+                            if el.text and not el.text.isspace()
+                            else {}
+                        ),
+                        **{  # i-th child element is keyed by the i-th name
+                            nm: ch.text.strip() if ch.text else None
+                            for nm, ch in zip(self.names, el.xpath("*"))
+                        },
+                    }
+                    for el in elems
+                ]
+            else:  # key each child by its tag; empty text becomes None
+                dicts = [
+                    {
+                        ch.tag: ch.text.strip() if ch.text else None
+                        for ch in el.xpath("*")
+                    }
+                    for el in elems
+                ]
+
+        elif self.attrs_only:
+            dicts = [el.attrib for el in elems]  # attribute mappings used as-is
+
+        else:  # attributes, own text and child elements; later keys override
+            if self.names:
+                dicts = [
+                    {
+                        **el.attrib,
+                        **(  # matched node's own text, kept only when non-blank
+                            {el.tag: el.text.strip()}
+                            if el.text and not el.text.isspace()
+                            else {}
+                        ),
+                        **{  # i-th child element is keyed by the i-th name
+                            nm: ch.text.strip() if ch.text else None
+                            for nm, ch in zip(self.names, el.xpath("*"))
+                        },
+                    }
+                    for el in elems
+                ]
+            else:
+                dicts = [
+                    {
+                        **el.attrib,
+                        **(  # matched node's own text, kept only when non-blank
+                            {el.tag: el.text.strip()}
+                            if el.text and not el.text.isspace()
+                            else {}
+                        ),
+                        **{  # children keyed by their own tag
+                            ch.tag: ch.text.strip() if ch.text else None
+                            for ch in el.xpath("*")
+                        },
+                    }
+                    for el in elems
+                ]
+        # Keep only the part of each key after "}" (drops a namespace prefix).
+        if self.namespaces or "}" in list(dicts[0].keys())[0]:
+            dicts = [
+                {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
+                for d in dicts
+            ]
+        # Align rows: union of keys in first-seen order, missing values -> None.
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+        # Relabel values positionally; zip stops at the shorter of names/values.
+        if self.names:
+            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
+
+        return dicts
+
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:  # rows via iterparse
+        from lxml.etree import iterparse
+        # dicts collects finished rows; row is the one being filled (None outside).
+        dicts: list[dict[str, str | None]] = []
+        row: dict[str, str | None] | None = None
+
+        if not isinstance(self.iterparse, dict):
+            raise TypeError(
+                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
+            )
+        # First key = row node. NOTE(review): empty dict -> "" -> KeyError on lookup.
+        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
+        if not is_list_like(self.iterparse[row_node]):
+            raise TypeError(
+                f"{type(self.iterparse[row_node])} is not a valid type "
+                "for value in iterparse"
+            )
+        # Input must be a str that is not a URL, raw XML text or a compressed file.
+        if (
+            not isinstance(self.path_or_buffer, str)
+            or is_url(self.path_or_buffer)
+            or is_fsspec_url(self.path_or_buffer)
+            or self.path_or_buffer.startswith(("<?xml", "<"))
+            or infer_compression(self.path_or_buffer, "infer") is not None
+        ):
+            raise ParserError(
+                "iterparse is designed for large XML files that are fully extracted on "
+                "local disk and not as compressed files or online sources."
+            )
+        # Stream start/end events; a row spans its node's start and end event.
+        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
+            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
+            # Opening tag of the row node begins a fresh row.
+            if event == "start":
+                if curr_elem == row_node:
+                    row = {}
+            # While inside a row, capture matching element text and attributes.
+            if row is not None:
+                if self.names:  # store under the positionally paired name
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:  # set nm once, skipping repeated values
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:  # same rule for attributes
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:  # no names: key by column, later matches overwrite
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
+            # Closing tag: store a finished row, then release parsed nodes.
+            if event == "end":
+                if curr_elem == row_node and row is not None:
+                    dicts.append(row)
+                    row = None
+                # Clear elem, drop preceding siblings (presumably to free memory).
+                elem.clear()
+                while elem.getprevious() is not None:
+                    del elem.getparent()[0]
+
+        if dicts == []:
+            raise ParserError("No result from selected items in iterparse.")
+        # Align rows: union of keys in first-seen order, missing values -> None.
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+        # Relabel values positionally; zip stops at the shorter of names/values.
+        if self.names:
+            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
+
+        return dicts
+
     def _validate_path(self) -> None:
 
         msg = (

From 35bc554d2dabe98fa0a4f7ac6073574b24f513f3 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Tue, 21 Jun 2022 17:07:24 -0500
Subject: [PATCH 5/5] Remove whatsnew bug fix note on unreleased version
 feature

---
 doc/source/whatsnew/v1.5.0.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 75a51cfbc506a..76f6e864a174f 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -874,7 +874,6 @@ I/O
 - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
 - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`)
 - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`)
-- Bug in :func:`read_xml` when reading XML with duplicate element and attribute names (:issue:`47343`)
 
 Period
 ^^^^^^