diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 7a7e518e1f7db..15b3b894c68b6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
However, if XPath does not reference node names such as default, ``/*``, then
``namespaces`` is not required.
+.. note::
+
+ Since ``xpath`` identifies the parent of content to be parsed, only immediate
+ descendants which include child nodes or current attributes are parsed.
+ Therefore, ``read_xml`` will not parse the text of grandchildren or other
+ descendants and will not parse attributes of any descendant. To retrieve
+ lower level content, adjust ``xpath`` to a lower level. For example,
+
+ .. ipython:: python
+ :okwarning:
+
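+ xml = """
+ <data>
+ <row>
+ <shape sides="4">square</shape>
+ <degrees>360</degrees>
+ </row>
+ <row>
+ <shape sides="0">circle</shape>
+ <degrees>360</degrees>
+ </row>
+ <row>
+ <shape sides="3">triangle</shape>
+ <degrees>180</degrees>
+ </row>
+ </data>"""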
+
+ df = pd.read_xml(xml, xpath="./row")
+ df
+
+ shows the attribute ``sides`` on the ``shape`` element was not parsed as
+ expected since this attribute resides on the child of the ``row`` element
+ and not the ``row`` element itself. In other words, the ``sides`` attribute is
+ a grandchild level descendant of the ``row`` element. However, the ``xpath``
+ targets the ``row`` element, which covers only its children and attributes.
+
With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
script which also can be string/file/URL types. As background, `XSLT`_ is
a special-purpose language written in a special XML file that can transform
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index fbe3e41be88a9..71d19b7861fc2 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
return dicts
- def _validate_path(self) -> None:
+ def _validate_path(self) -> list[Any]:
"""
Validate xpath.
@@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
if self.iterparse is None:
self.xml_doc = self._parse_doc(self.path_or_buffer)
- self._validate_path()
- elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+ elems = self._validate_path()
self._validate_names()
@@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
return xml_dicts
- def _validate_path(self) -> None:
+ def _validate_path(self) -> list[Any]:
"""
Notes
-----
@@ -468,18 +467,28 @@ def _validate_path(self) -> None:
"""
msg = (
- "xpath does not return any nodes. "
+ "xpath does not return any nodes or attributes. "
+ "Be sure to specify in `xpath` the parent nodes of "
+ "children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)
try:
- elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
+ elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+ children = [ch for el in elems for ch in el.findall("*")]
+ attrs = {k: v for el in elems for k, v in el.attrib.items()}
+
if elems is None:
raise ValueError(msg)
- if elems is not None and elems.find("*") is None and elems.attrib is None:
- raise ValueError(msg)
+ if elems is not None:
+ if self.elems_only and children == []:
+ raise ValueError(msg)
+ elif self.attrs_only and attrs == {}:
+ raise ValueError(msg)
+ elif children == [] and attrs == {}:
+ raise ValueError(msg)
except (KeyError, SyntaxError):
raise SyntaxError(
@@ -488,6 +497,8 @@ def _validate_path(self) -> None:
"undeclared namespace prefix."
)
+ return elems
+
def _validate_names(self) -> None:
children: list[Any]
@@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
self.xsl_doc = self._parse_doc(self.stylesheet)
self.xml_doc = self._transform_doc()
- self._validate_path()
- elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
+ elems = self._validate_path()
self._validate_names()
@@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]:
return xml_dicts
- def _validate_path(self) -> None:
+ def _validate_path(self) -> list[Any]:
msg = (
- "xpath does not return any nodes. "
- "Be sure row level nodes are in xpath. "
+ "xpath does not return any nodes or attributes. "
+ "Be sure to specify in `xpath` the parent nodes of "
+ "children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
- children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
- attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
+ children = [ch for el in elems for ch in el.xpath("*")]
+ attrs = {k: v for el in elems for k, v in el.attrib.items()}
if elems == []:
raise ValueError(msg)
- if elems != [] and attrs == [] and children == []:
- raise ValueError(msg)
+ if elems != []:
+ if self.elems_only and children == []:
+ raise ValueError(msg)
+ elif self.attrs_only and attrs == {}:
+ raise ValueError(msg)
+ elif children == [] and attrs == {}:
+ raise ValueError(msg)
+
+ return elems
def _validate_names(self) -> None:
children: list[Any]
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index fd4ba87bd302c..935a44d0e0901 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -760,6 +760,45 @@ def test_elem_and_attrs_only(datapath, parser):
read_xml(filename, elems_only=True, attrs_only=True, parser=parser)
+def test_empty_attrs_only(parser):
+ xml = """
+
+
+ square
+ 360
+
+
+ circle
+ 360
+
+
+ triangle
+ 180
+
+ """
+
+ with pytest.raises(
+ ValueError,
+ match=("xpath does not return any nodes or attributes"),
+ ):
+ read_xml(xml, xpath="./row", attrs_only=True, parser=parser)
+
+
+def test_empty_elems_only(parser):
+ xml = """
+
+
+
+
+ """
+
+ with pytest.raises(
+ ValueError,
+ match=("xpath does not return any nodes or attributes"),
+ ):
+ read_xml(xml, xpath="./row", elems_only=True, parser=parser)
+
+
@td.skip_if_no("lxml")
def test_attribute_centric_xml():
xml = """\