From 48ed43be4697f5fe7b8164377a39434fcbf6e6a2 Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Sat, 2 Nov 2013 19:06:39 +0100
Subject: [PATCH 1/6] io.html.read_html: support XPath expressions for table
 selection (lxml parser only for now)

---
 pandas/io/html.py            | 67 ++++++++++++++++++++++++------------
 pandas/io/tests/test_html.py | 39 +++++++++++++++++++++
 2 files changed, 84 insertions(+), 22 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index f3cfa3a16807a..a4e2af3b9f0c5 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -165,13 +165,15 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, xpath):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.xpath = xpath
 
     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs,
+                                    self.xpath)
         return (self._build_table(table) for table in tables)
 
     def _parse_raw_data(self, rows):
@@ -227,7 +229,7 @@ def _parse_td(self, obj):
         """
         raise NotImplementedError
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         """Return all tables from the parsed DOM.
 
         Parameters
@@ -242,6 +244,9 @@ def _parse_tables(self, doc, match, attrs):
             A dictionary of table attributes that can be used to disambiguate
             mutliple tables on a page.
 
+        xpath : str or None
+            An XPath style string used to filter for tables to be returned.
+
         Raises
         ------
         ValueError
@@ -393,7 +398,7 @@ def _parse_tbody(self, table):
     def _parse_tfoot(self, table):
         return table.find_all('tfoot')
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
 
@@ -481,24 +486,33 @@ def _parse_tr(self, table):
         expr = './/tr[normalize-space()]'
         return table.xpath(expr)
 
-    def _parse_tables(self, doc, match, kwargs):
-        pattern = match.pattern
+    def _parse_tables(self, doc, match, kwargs, xpath):
+        if xpath:
+            xpath_expr = xpath
+            tables = doc.xpath(xpath_expr)
+
+            if not tables:
+                raise ValueError("No tables found using XPath expression %s" % xpath)
+            return tables
 
-        # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table
-        query = '//table//*[re:test(text(), %r)]/ancestor::table'
-        xpath_expr = u(query) % pattern
+        else:
+            pattern = match.pattern
 
-        # if any table attributes were given build an xpath expression to
-        # search for them
-        if kwargs:
-            xpath_expr += _build_xpath_expr(kwargs)
+            # 1. check all descendants for the given pattern and only search tables
+            # 2. go up the tree until we find a table
+            query = '//table//*[re:test(text(), %r)]/ancestor::table'
+            xpath_expr = u(query) % pattern
 
-        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+            # if any table attributes were given build an xpath expression to
+            # search for them
+            if kwargs:
+                xpath_expr += _build_xpath_expr(kwargs)
 
-        if not tables:
-            raise ValueError("No tables found matching regex %r" % pattern)
-        return tables
+            tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+
+            if not tables:
+                raise ValueError("No tables found matching regex %r" % pattern)
+            return tables
 
     def _build_doc(self):
         """
@@ -688,7 +702,7 @@ def _validate_flavor(flavor):
 
 
 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, xpath):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
@@ -696,7 +710,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        if xpath and flav in ('bs4', 'html5lib'):
+            raise NotImplementedError
+
+        p = parser(io, compiled_match, attrs, xpath)
 
         try:
             tables = p.parse_tables()
@@ -714,7 +731,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
 
 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', xpath=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -795,6 +812,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
 
+    xpath : str or None, optional
+        If not ``None`` try to identify the set of tables to be read by an
+        XPath string; takes precedence over ``match``. Defaults to ``None``.
+        Note: This functionality is not (yet) available with the ``bs4`` and
+        ``html5lib`` flavors, which raise ``NotImplementedError``.
+
     Returns
     -------
     dfs : list of DataFrames
@@ -840,4 +863,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, xpath)
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index c26048d4cf20b..bec0fc8588ede 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -581,12 +581,28 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])
 
+    def test_xpath_bs4_not_implemented(self):
+        with open(self.spam_data) as f:
+            with self.assertRaises(NotImplementedError):
+                self.read_html(f, flavor='bs4',
+                               xpath="//div[@class='garbage']/table")
+
 
 class TestReadHtmlLxml(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         _skip_if_no('lxml')
 
+    def setup_data(self):
+        self.valid_data = os.path.join(DATA_PATH, 'valid_markup.html')
+
+    def setup_flavor(self):
+        self.flavor = 'lxml'
+
+    def setUp(self):
+        self.setup_data()
+        self.setup_flavor()
+
     def read_html(self, *args, **kwargs):
         self.flavor = ['lxml']
         kwargs['flavor'] = kwargs.get('flavor', self.flavor)
@@ -630,6 +646,29 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])
 
+    def test_xpath_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//table[@class='dataframe']")
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    @slow
+    def test_xpath_file_url(self):
+        url = self.valid_data
+        dfs = self.read_html(file_path_to_url(url),
+                             xpath="//*[@class='dataframe']")
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_xpath_table_not_found(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//div[@class='garbage']/table")
+
 
 def test_invalid_flavor():
     url = 'google.com'

From de9451242d98b20cbf7e1f98e91d8f0c1f44d5ed Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Sat, 2 Nov 2013 21:53:11 +0100
Subject: [PATCH 2/6] Coverage tests for `match` and `attrs` parameters

---
 pandas/io/tests/test_html.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index bec0fc8588ede..6de03475815b1 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -646,6 +646,21 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])
 
+    def test_attrs_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 attrs={'class': 'dataframe'})
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_match_no_match(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                dfs = self.read_html(f,
+                                     match='supercalifragilistic')
+
     def test_xpath_file_like(self):
         with open(self.valid_data) as f:
             dfs = self.read_html(f,

From 9a300b45f10c51a9805f1d96039bcf10eff0d2c9 Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Sun, 3 Nov 2013 13:24:51 +0100
Subject: [PATCH 3/6] XPath expression has to match table elements only

---
 pandas/io/html.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index a4e2af3b9f0c5..f205d55ef923a 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -31,6 +31,7 @@
 
 try:
     import lxml
+    from lxml.etree import XPathEvalError
 except ImportError:
     _HAS_LXML = False
 else:
@@ -491,6 +492,9 @@ def _parse_tables(self, doc, match, kwargs, xpath):
             xpath_expr = xpath
             tables = doc.xpath(xpath_expr)
 
+            if not all(table.tag == 'table' for table in tables):
+                raise ValueError("XPath expression %s matched non-table elements" % xpath)
+
             if not tables:
                 raise ValueError("No tables found using XPath expression %s" % xpath)
             return tables

From 563a95576d93baf959bc8feef17ee5d407d250a9 Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Sun, 3 Nov 2013 13:25:23 +0100
Subject: [PATCH 4/6] Further testing for XPath feature

---
 pandas/io/html.py            |  5 ++++-
 pandas/io/tests/test_html.py | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index f205d55ef923a..e0b093a6eedc3 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -31,7 +31,6 @@
 
 try:
     import lxml
-    from lxml.etree import XPathEvalError
 except ImportError:
     _HAS_LXML = False
 else:
@@ -710,6 +709,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
+    if xpath and not _HAS_LXML:
+        raise ValueError("XPath table selection needs the lxml module, "
+                         "please install it.")
+
     # hack around python 3 deleting the exception variable
     retained = None
     for flav in flavor:
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 6de03475815b1..8fced1fbbda0b 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -18,6 +18,11 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow
 
+try:
+    from lxml.etree import XPathEvalError
+except ImportError:
+    pass
+
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
@@ -679,11 +684,36 @@ def test_xpath_file_url(self):
         for df in dfs:
             tm.assert_isinstance(df, DataFrame)
 
-    def test_xpath_table_not_found(self):
+    def test_xpath_direct_ref(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//html/body/table[@class='dataframe']"
+                                       "[last()]")
+        assert dfs[0].shape == (2, 3)
+
+    def test_xpath_match_multiple(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//*[@class='dataframe']")
+
+        assert len(dfs) == 2
+
+    def test_xpath_match_none(self):
         with open(self.valid_data) as f:
             with self.assertRaises(ValueError):
                 self.read_html(f, xpath="//div[@class='garbage']/table")
 
+    def test_xpath_not_all_tables(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f,
+                               xpath="//tr")
+
+    def test_invalid_xpath(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(XPathEvalError):
+                self.read_html(f, xpath="//div[@@class=garbage]/table")
+
 
 def test_invalid_flavor():
     url = 'google.com'

From 432269b86f4ab63c6b4abe3f29faf400f5dc80be Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Thu, 7 Nov 2013 19:20:31 +0100
Subject: [PATCH 5/6] Correct format specifiers for XPath string formatting

---
 pandas/io/html.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index e0b093a6eedc3..c7e7210247c8d 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -492,10 +492,10 @@ def _parse_tables(self, doc, match, kwargs, xpath):
             tables = doc.xpath(xpath_expr)
 
             if not all(table.tag == 'table' for table in tables):
-                raise ValueError("XPath expression %s matched non-table elements" % xpath)
+                raise ValueError("XPath expression %r matched non-table elements" % xpath)
 
             if not tables:
-                raise ValueError("No tables found using XPath expression %s" % xpath)
+                raise ValueError("No tables found using XPath expression %r" % xpath)
             return tables
 
         else:

From c2bcbc90a957e17a982bfd55297934c55075b905 Mon Sep 17 00:00:00 2001
From: phaebz <phaebz@gmail.com>
Date: Thu, 7 Nov 2013 19:23:56 +0100
Subject: [PATCH 6/6] Release notes addition

---
 doc/source/release.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 6e10bd651d90a..c779b25b30444 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -52,6 +52,8 @@ pandas 0.13.0
 New features
 ~~~~~~~~~~~~
 
+  - ``read_html()`` now accepts an ``xpath`` string argument, an XPath
+    expression used to select the tables to read; lxml only (:issue:`5416`)
   - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and
     ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
     the bandwidth, and to gkde.evaluate() to specify the indicies at which it