From 4aa32cc3ff382843d0d3a5c56402c70e20b3b63f Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 17 Jun 2022 23:16:04 -0500 Subject: [PATCH 1/5] BUG: iterparse of read_xml not parsing duplicate element and attribute names --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/xml.py | 40 ++++++++++++++++++++++++--------- pandas/tests/io/xml/test_xml.py | 35 +++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 76f6e864a174f..75a51cfbc506a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -874,6 +874,7 @@ I/O - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`) - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`) +- Bug in :func:`read_xml` when reading XML with duplicate element and attribute names (:issue:`47343`) Period ^^^^^^ diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 181b0fe115f4c..2c6b20058122b 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -413,11 +413,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: row = {} if row is not None: - for col in self.iterparse[row_node]: - if curr_elem == col: - row[col] = elem.text.strip() if elem.text else None - if col in elem.attrib: - row[col] = elem.attrib[col] + if self.names: + for col, nm in zip(self.iterparse[row_node], self.names): + if curr_elem == col: + elem_val = elem.text.strip() if elem.text else None + if elem_val not in row.values() and nm not in row: + row[nm] = elem_val + if col in elem.attrib: + if elem.attrib[col] not in row.values() and nm not in row: + row[nm] = elem.attrib[col] + else: + for col in self.iterparse[row_node]: + if curr_elem == col: + row[col] = elem.text.strip() if elem.text else None + if col in elem.attrib: + row[col] = elem.attrib[col] if event == "end": if curr_elem == row_node and row is not None: @@ -661,11 +671,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: row = {} if row is not None: - for col in self.iterparse[row_node]: - if curr_elem == col: - row[col] = elem.text.strip() if elem.text else None - if col in elem.attrib: - row[col] = elem.attrib[col] + if self.names: + for col, nm in zip(self.iterparse[row_node], self.names): + if curr_elem == col: + elem_val = elem.text.strip() if elem.text else None + if elem_val not in row.values() and nm not in row: + row[nm] = elem_val + if col in elem.attrib: + if elem.attrib[col] not in row.values() and nm not in row: + row[nm] = elem.attrib[col] + else: + for col in self.iterparse[row_node]: + if curr_elem == col: + row[col] = elem.text.strip() if elem.text else None + if col in elem.attrib: + row[col] = elem.attrib[col] if event == "end": if curr_elem == row_node and row is not None: diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 277b6442a0a8c..eb2230bbf7fd5 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -789,6 +789,41 @@ def test_names_option_output(datapath, parser): tm.assert_frame_equal(df_iter, df_expected) +def test_repeat_names(parser): + xml = """\ + + + circle + curved + + + sphere + curved + +""" + df_xpath = read_xml( + xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"] + ) + + df_iter = read_xml_iterparse( + xml, + parser=parser, + iterparse={"shape": ["type", "name", "type"]}, + names=["type_dim", "shape", "type_edge"], + ) + + df_expected = DataFrame( + { + "type_dim": ["2D", "3D"], + "shape": ["circle", "sphere"], + "type_edge": ["curved", "curved"], + } + ) + + tm.assert_frame_equal(df_xpath, df_expected) + tm.assert_frame_equal(df_iter, df_expected) + + def test_names_option_wrong_length(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") From 65c09d57d664ab0e9d8e0570179121db9c7a0e1f Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 18 Jun 2022 16:49:52 -0500 Subject: [PATCH 2/5] Refactor duplicative code in each parser to shared base class --- pandas/io/xml.py | 366 ++++++++++++++--------------------------------- 1 file changed, 107 insertions(+), 259 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 2c6b20058122b..d97e840524116 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -177,7 +177,7 @@ def parse_data(self) -> list[dict[str, str | None]]: raise AbstractMethodError(self) - def _parse_nodes(self) -> list[dict[str, str | None]]: + def _parse_nodes(self, elems) -> list[dict[str, str | None]]: """ Parse xml nodes. @@ -197,102 +197,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: will have optional keys filled with None values. """ - raise AbstractMethodError(self) - - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - """ - Iterparse xml nodes. - - This method will read in local disk, decompressed XML files for elements - and underlying descendants using iterparse, a method to iterate through - an XML tree without holding entire XML tree in memory. - - Raises - ------ - TypeError - * If `iterparse` is not a dict or its dict value is not list-like. - ParserError - * If `path_or_buffer` is not a physical, decompressed file on disk. - * If no data is returned from selected items in `iterparse`. - - Notes - ----- - Namespace URIs will be removed from return node values. Also, - elements with missing children or attributes in submitted list - will have optional keys filled with None values. - """ - - raise AbstractMethodError(self) - - def _validate_path(self) -> None: - """ - Validate xpath. - - This method checks for syntax, evaluation, or empty nodes return. - - Raises - ------ - SyntaxError - * If xpah is not supported or issues with namespaces. - - ValueError - * If xpah does not return any nodes. - """ - - raise AbstractMethodError(self) - - def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list-like and aligns - with length of parse nodes. - - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ - raise AbstractMethodError(self) - - def _parse_doc(self, raw_doc) -> bytes: - """ - Build tree from path_or_buffer. - - This method will parse XML object into tree - either from string/bytes or file location. - """ - raise AbstractMethodError(self) - - -class _EtreeFrameParser(_XMLFrameParser): - """ - Internal class to parse XML into DataFrames with the Python - standard library XML module: `xml.etree.ElementTree`. - """ - - def parse_data(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import XML - - if self.stylesheet is not None: - raise ValueError( - "To use stylesheet, you need lxml installed and selected as parser." - ) - - if self.iterparse is None: - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) - self._validate_path() - - self._validate_names() - - xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() - ) - - return xml_dicts - - def _parse_nodes(self) -> list[dict[str, str | None]]: - elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) dicts: list[dict[str, str | None]] if self.elems_only and self.attrs_only: @@ -375,8 +279,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import iterparse + def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]: + """ + Iterparse xml nodes. + + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. + + Raises + ------ + TypeError + * If `iterparse` is not a dict or its dict value is not list-like. + ParserError + * If `path_or_buffer` is not a physical, decompressed file on disk. + * If no data is returned from selected items in `iterparse`. + + Notes + ----- + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes in submitted list + will have optional keys filled with None values. + """ dicts: list[dict[str, str | None]] = [] row: dict[str, str | None] | None = None @@ -433,6 +357,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: if curr_elem == row_node and row is not None: dicts.append(row) row = None + elem.clear() if dicts == []: @@ -446,6 +371,79 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for syntax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + ValueError + * If xpah does not return any nodes. + """ + + raise AbstractMethodError(self) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + raise AbstractMethodError(self) + + def _parse_doc(self, raw_doc) -> bytes: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ + raise AbstractMethodError(self) + + +class _EtreeFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with the Python + standard library XML module: `xml.etree.ElementTree`. + """ + + def parse_data(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import ( + XML, + iterparse, + ) + + if self.stylesheet is not None: + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." + ) + + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self._validate_path() + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + + self._validate_names() + + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) + ) + + return xml_dicts + def _validate_path(self) -> None: """ Notes @@ -531,7 +529,10 @@ def parse_data(self) -> list[dict[str, str | None]]: validate xpath, names, optionally parse and run XSLT, and parse original or transformed XML and return specific nodes. """ - from lxml.etree import XML + from lxml.etree import ( + XML, + iterparse, + ) if self.iterparse is None: self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) @@ -541,172 +542,18 @@ def parse_data(self) -> list[dict[str, str | None]]: self.xml_doc = XML(self._transform_doc()) self._validate_path() + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) self._validate_names() xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) ) return xml_dicts - def _parse_nodes(self) -> list[dict[str, str | None]]: - elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) - dicts: list[dict[str, str | None]] - - if self.elems_only and self.attrs_only: - raise ValueError("Either element or attributes can be parsed not both.") - - elif self.elems_only: - if self.names: - dicts = [ - { - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - **{ - nm: ch.text.strip() if ch.text else None - for nm, ch in zip(self.names, el.xpath("*")) - }, - } - for el in elems - ] - else: - dicts = [ - { - ch.tag: ch.text.strip() if ch.text else None - for ch in el.xpath("*") - } - for el in elems - ] - - elif self.attrs_only: - dicts = [el.attrib for el in elems] - - else: - if self.names: - dicts = [ - { - **el.attrib, - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - **{ - nm: ch.text.strip() if ch.text else None - for nm, ch in zip(self.names, el.xpath("*")) - }, - } - for el in elems - ] - else: - dicts = [ - { - **el.attrib, - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - **{ - ch.tag: ch.text.strip() if ch.text else None - for ch in el.xpath("*") - }, - } - for el in elems - ] - - if self.namespaces or "}" in list(dicts[0].keys())[0]: - dicts = [ - {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} - for d in dicts - ] - - keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) - dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] - - if self.names: - dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] - - return dicts - - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - from lxml.etree import iterparse - - dicts: list[dict[str, str | None]] = [] - row: dict[str, str | None] | None = None - - if not isinstance(self.iterparse, dict): - raise TypeError( - f"{type(self.iterparse).__name__} is not a valid type for iterparse" - ) - - row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" - if not is_list_like(self.iterparse[row_node]): - raise TypeError( - f"{type(self.iterparse[row_node])} is not a valid type " - "for value in iterparse" - ) - - if ( - not isinstance(self.path_or_buffer, str) - or is_url(self.path_or_buffer) - or is_fsspec_url(self.path_or_buffer) - or self.path_or_buffer.startswith((" None: msg = ( @@ -1040,7 +887,8 @@ def read_xml( names : list-like, optional Column names for DataFrame of parsed XML data. Use this parameter to - rename original element names and distinguish same named elements. + rename original element names and distinguish same named elements and + attributes. dtype : Type name or dict of column -> type, optional Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, From 3c241d5c823cf94c68c3f043ac3edfc08746aee5 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 19 Jun 2022 11:27:07 -0500 Subject: [PATCH 3/5] Add lxml preceding-sibling iterparse cleanup --- pandas/io/xml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index d97e840524116..570843987e022 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -359,6 +359,9 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]: row = None elem.clear() + if hasattr(elem, "getprevious"): + while elem.getprevious() is not None: + del elem.getparent()[0] if dicts == []: raise ParserError("No result from selected items in iterparse.") From d73b32f80407964016a6e4cc9c4c5d4fc972c1d7 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 19 Jun 2022 20:38:10 -0500 Subject: [PATCH 4/5] Revert code refactoring back to bug fix only --- pandas/io/xml.py | 366 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 258 insertions(+), 108 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 570843987e022..78fbeaad09300 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -177,7 +177,7 @@ def parse_data(self) -> list[dict[str, str | None]]: raise AbstractMethodError(self) - def _parse_nodes(self, elems) -> list[dict[str, str | None]]: + def _parse_nodes(self) -> list[dict[str, str | None]]: """ Parse xml nodes. @@ -197,6 +197,102 @@ def _parse_nodes(self, elems) -> list[dict[str, str | None]]: will have optional keys filled with None values. """ + raise AbstractMethodError(self) + + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + """ + Iterparse xml nodes. + + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. + + Raises + ------ + TypeError + * If `iterparse` is not a dict or its dict value is not list-like. + ParserError + * If `path_or_buffer` is not a physical, decompressed file on disk. + * If no data is returned from selected items in `iterparse`. + + Notes + ----- + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes in submitted list + will have optional keys filled with None values. + """ + + raise AbstractMethodError(self) + + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for syntax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + ValueError + * If xpah does not return any nodes. + """ + + raise AbstractMethodError(self) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + raise AbstractMethodError(self) + + def _parse_doc(self, raw_doc) -> bytes: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ + raise AbstractMethodError(self) + + +class _EtreeFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with the Python + standard library XML module: `xml.etree.ElementTree`. + """ + + def parse_data(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import XML + + if self.stylesheet is not None: + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." + ) + + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self._validate_path() + + self._validate_names() + + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + ) + + return xml_dicts + + def _parse_nodes(self) -> list[dict[str, str | None]]: + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) dicts: list[dict[str, str | None]] if self.elems_only and self.attrs_only: @@ -279,28 +375,8 @@ def _parse_nodes(self, elems) -> list[dict[str, str | None]]: return dicts - def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]: - """ - Iterparse xml nodes. - - This method will read in local disk, decompressed XML files for elements - and underlying descendants using iterparse, a method to iterate through - an XML tree without holding entire XML tree in memory. - - Raises - ------ - TypeError - * If `iterparse` is not a dict or its dict value is not list-like. - ParserError - * If `path_or_buffer` is not a physical, decompressed file on disk. - * If no data is returned from selected items in `iterparse`. - - Notes - ----- - Namespace URIs will be removed from return node values. Also, - elements with missing children or attributes in submitted list - will have optional keys filled with None values. - """ + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import iterparse dicts: list[dict[str, str | None]] = [] row: dict[str, str | None] | None = None @@ -357,11 +433,7 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]: if curr_elem == row_node and row is not None: dicts.append(row) row = None - elem.clear() - if hasattr(elem, "getprevious"): - while elem.getprevious() is not None: - del elem.getparent()[0] if dicts == []: raise ParserError("No result from selected items in iterparse.") @@ -374,79 +446,6 @@ def _iterparse_nodes(self, iterparse) -> list[dict[str, str | None]]: return dicts - def _validate_path(self) -> None: - """ - Validate xpath. - - This method checks for syntax, evaluation, or empty nodes return. - - Raises - ------ - SyntaxError - * If xpah is not supported or issues with namespaces. - - ValueError - * If xpah does not return any nodes. - """ - - raise AbstractMethodError(self) - - def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list-like and aligns - with length of parse nodes. - - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ - raise AbstractMethodError(self) - - def _parse_doc(self, raw_doc) -> bytes: - """ - Build tree from path_or_buffer. - - This method will parse XML object into tree - either from string/bytes or file location. - """ - raise AbstractMethodError(self) - - -class _EtreeFrameParser(_XMLFrameParser): - """ - Internal class to parse XML into DataFrames with the Python - standard library XML module: `xml.etree.ElementTree`. - """ - - def parse_data(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import ( - XML, - iterparse, - ) - - if self.stylesheet is not None: - raise ValueError( - "To use stylesheet, you need lxml installed and selected as parser." - ) - - if self.iterparse is None: - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) - self._validate_path() - elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) - - self._validate_names() - - xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes(elems) - if self.iterparse is None - else self._iterparse_nodes(iterparse) - ) - - return xml_dicts - def _validate_path(self) -> None: """ Notes @@ -532,10 +531,7 @@ def parse_data(self) -> list[dict[str, str | None]]: validate xpath, names, optionally parse and run XSLT, and parse original or transformed XML and return specific nodes. """ - from lxml.etree import ( - XML, - iterparse, - ) + from lxml.etree import XML if self.iterparse is None: self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) @@ -545,18 +541,172 @@ def parse_data(self) -> list[dict[str, str | None]]: self.xml_doc = XML(self._transform_doc()) self._validate_path() - elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) self._validate_names() xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes(elems) - if self.iterparse is None - else self._iterparse_nodes(iterparse) + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() ) return xml_dicts + def _parse_nodes(self) -> list[dict[str, str | None]]: + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + dicts: list[dict[str, str | None]] + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + + elif self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + } + for el in elems + ] + + elif self.attrs_only: + dicts = [el.attrib for el in elems] + + else: + if self.names: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + }, + } + for el in elems + ] + + if self.namespaces or "}" in list(dicts[0].keys())[0]: + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} + for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] + + return dicts + + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from lxml.etree import iterparse + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if ( + not isinstance(self.path_or_buffer, str) + or is_url(self.path_or_buffer) + or is_fsspec_url(self.path_or_buffer) + or self.path_or_buffer.startswith((" None: msg = ( From 35bc554d2dabe98fa0a4f7ac6073574b24f513f3 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 21 Jun 2022 17:07:24 -0500 Subject: [PATCH 5/5] Remove whatsnew bug fix note on unreleased version feature --- doc/source/whatsnew/v1.5.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 75a51cfbc506a..76f6e864a174f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -874,7 +874,6 @@ I/O - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`) - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`) -- Bug in :func:`read_xml` when reading XML with duplicate element and attribute names (:issue:`47343`) Period ^^^^^^