diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7a7e518e1f7db..15b3b894c68b6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes. However, if XPath does not reference node names such as default, ``/*``, then ``namespaces`` is not required. +.. note:: + + Since ``xpath`` identifies the parent of content to be parsed, only immediate + descendants which include child nodes or current attributes are parsed. + Therefore, ``read_xml`` will not parse the text of grandchildren or other + descendants and will not parse attributes of any descendant. To retrieve + lower level content, adjust xpath to lower level. For example, + + .. ipython:: python + :okwarning: + + xml = """ + + + square + 360 + + + circle + 360 + + + triangle + 180 + + """ + + df = pd.read_xml(xml, xpath="./row") + df + + shows the attribute ``sides`` on ``shape`` element was not parsed as + expected since this attribute resides on the child of ``row`` element + and not ``row`` element itself. In other words, ``sides`` attribute is a + grandchild level descendant of ``row`` element. However, the ``xpath`` + targets ``row`` element which covers only its children and attributes. + With `lxml`_ as parser, you can flatten nested XML documents with an XSLT script which also can be string/file/URL types. As background, `XSLT`_ is a special-purpose language written in a special XML file that can transform diff --git a/pandas/io/xml.py b/pandas/io/xml.py index fbe3e41be88a9..71d19b7861fc2 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: return dicts - def _validate_path(self) -> None: + def _validate_path(self) -> list[Any]: """ Validate xpath. 
@@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]: if self.iterparse is None: self.xml_doc = self._parse_doc(self.path_or_buffer) - self._validate_path() - elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + elems = self._validate_path() self._validate_names() @@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]: return xml_dicts - def _validate_path(self) -> None: + def _validate_path(self) -> list[Any]: """ Notes ----- @@ -468,18 +467,28 @@ def _validate_path(self) -> None: """ msg = ( - "xpath does not return any nodes. " + "xpath does not return any nodes or attributes. " + "Be sure to specify in `xpath` the parent nodes of " + "children and attributes to parse. " "If document uses namespaces denoted with " "xmlns, be sure to define namespaces and " "use them in xpath." ) try: - elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + children = [ch for el in elems for ch in el.findall("*")] + attrs = {k: v for el in elems for k, v in el.attrib.items()} + if elems is None: raise ValueError(msg) - if elems is not None and elems.find("*") is None and elems.attrib is None: - raise ValueError(msg) + if elems is not None: + if self.elems_only and children == []: + raise ValueError(msg) + elif self.attrs_only and attrs == {}: + raise ValueError(msg) + elif children == [] and attrs == {}: + raise ValueError(msg) except (KeyError, SyntaxError): raise SyntaxError( @@ -488,6 +497,8 @@ def _validate_path(self) -> None: "undeclared namespace prefix." 
) + return elems + def _validate_names(self) -> None: children: list[Any] @@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]: self.xsl_doc = self._parse_doc(self.stylesheet) self.xml_doc = self._transform_doc() - self._validate_path() - elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + elems = self._validate_path() self._validate_names() @@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]: return xml_dicts - def _validate_path(self) -> None: + def _validate_path(self) -> list[Any]: msg = ( - "xpath does not return any nodes. " - "Be sure row level nodes are in xpath. " + "xpath does not return any nodes or attributes. " + "Be sure to specify in `xpath` the parent nodes of " + "children and attributes to parse. " "If document uses namespaces denoted with " "xmlns, be sure to define namespaces and " "use them in xpath." ) elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) - children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) - attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) + children = [ch for el in elems for ch in el.xpath("*")] + attrs = {k: v for el in elems for k, v in el.attrib.items()} if elems == []: raise ValueError(msg) - if elems != [] and attrs == [] and children == []: - raise ValueError(msg) + if elems != []: + if self.elems_only and children == []: + raise ValueError(msg) + elif self.attrs_only and attrs == {}: + raise ValueError(msg) + elif children == [] and attrs == {}: + raise ValueError(msg) + + return elems def _validate_names(self) -> None: children: list[Any] diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index fd4ba87bd302c..935a44d0e0901 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -760,6 +760,45 @@ def test_elem_and_attrs_only(datapath, parser): read_xml(filename, elems_only=True, attrs_only=True, parser=parser) +def 
test_empty_attrs_only(parser): + xml = """ + + + square + 360 + + + circle + 360 + + + triangle + 180 + + """ + + with pytest.raises( + ValueError, + match=("xpath does not return any nodes or attributes"), + ): + read_xml(xml, xpath="./row", attrs_only=True, parser=parser) + + +def test_empty_elems_only(parser): + xml = """ + + + + + """ + + with pytest.raises( + ValueError, + match=("xpath does not return any nodes or attributes"), + ): + read_xml(xml, xpath="./row", elems_only=True, parser=parser) + + @td.skip_if_no("lxml") def test_attribute_centric_xml(): xml = """\