From 48ed43be4697f5fe7b8164377a39434fcbf6e6a2 Mon Sep 17 00:00:00 2001 From: phaebz Date: Sat, 2 Nov 2013 19:06:39 +0100 Subject: [PATCH 1/6] io.html.read_html support XPath expressions for table selection (only lxml parser atm) --- pandas/io/html.py | 67 ++++++++++++++++++++++++------------ pandas/io/tests/test_html.py | 39 +++++++++++++++++++++ 2 files changed, 84 insertions(+), 22 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index f3cfa3a16807a..a4e2af3b9f0c5 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -165,13 +165,15 @@ class _HtmlFrameParser(object): See each method's respective documentation for details on their functionality. """ - def __init__(self, io, match, attrs): + def __init__(self, io, match, attrs, xpath): self.io = io self.match = match self.attrs = attrs + self.xpath = xpath def parse_tables(self): - tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + tables = self._parse_tables(self._build_doc(), self.match, self.attrs, + self.xpath) return (self._build_table(table) for table in tables) def _parse_raw_data(self, rows): @@ -227,7 +229,7 @@ def _parse_td(self, obj): """ raise NotImplementedError - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, doc, match, attrs, xpath): """Return all tables from the parsed DOM. Parameters @@ -242,6 +244,9 @@ def _parse_tables(self, doc, match, attrs): A dictionary of table attributes that can be used to disambiguate mutliple tables on a page. + xpath : str or None + An XPath style string used to filter for tables to be returned. + Raises ------ ValueError @@ -393,7 +398,7 @@ def _parse_tbody(self, table): def _parse_tfoot(self, table): return table.find_all('tfoot') - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, doc, match, attrs, xpath): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) @@ -481,24 +486,33 @@ def _parse_tr(self, table): expr = './/tr[normalize-space()]' return table.xpath(expr) - def _parse_tables(self, doc, match, kwargs): - pattern = match.pattern + def _parse_tables(self, doc, match, kwargs, xpath): + if xpath: + xpath_expr = xpath + tables = doc.xpath(xpath_expr) + + if not tables: + raise ValueError("No tables found using XPath expression %s" % xpath) + return tables - # 1. check all descendants for the given pattern and only search tables - # 2. go up the tree until we find a table - query = '//table//*[re:test(text(), %r)]/ancestor::table' - xpath_expr = u(query) % pattern + else: + pattern = match.pattern - # if any table attributes were given build an xpath expression to - # search for them - if kwargs: - xpath_expr += _build_xpath_expr(kwargs) + # 1. check all descendants for the given pattern and only search tables + # 2. go up the tree until we find a table + query = '//table//*[re:test(text(), %r)]/ancestor::table' + xpath_expr = u(query) % pattern - tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) - if not tables: - raise ValueError("No tables found matching regex %r" % pattern) - return tables + tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + + if not tables: + raise ValueError("No tables found matching regex %r" % pattern) + return tables def _build_doc(self): """ @@ -688,7 +702,7 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, infer_types, - parse_dates, tupleize_cols, thousands, attrs): + parse_dates, tupleize_cols, thousands, attrs, xpath): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -696,7 +710,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs) + if xpath and flav in ('bs4', 'html5lib'): + raise NotImplementedError + + p = parser(io, compiled_match, attrs, xpath) try: tables = p.parse_tables() @@ -714,7 +731,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, infer_types=None, attrs=None, parse_dates=False, - tupleize_cols=False, thousands=','): + tupleize_cols=False, thousands=',', xpath=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -795,6 +812,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. + xpath : str or None, optional + If not ``None`` try to identify the set of tables to be read by an + XPath string; takes precedence over ``match``. Defaults to ``None``. + Note: This functionality is not (yet) available with the Beautiful Soup + parser (``flavor=bs4``). + Returns ------- dfs : list of DataFrames @@ -840,4 +863,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') return _parse(flavor, io, match, header, index_col, skiprows, infer_types, - parse_dates, tupleize_cols, thousands, attrs) + parse_dates, tupleize_cols, thousands, attrs, xpath) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index c26048d4cf20b..bec0fc8588ede 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -581,12 +581,28 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_xpath_bs4_not_implemented(self): + with open(self.spam_data) as f: + with self.assertRaises(NotImplementedError): + self.read_html(f, flavor='bs4', + xpath="//div[@class='garbage']/table") + class TestReadHtmlLxml(unittest.TestCase): @classmethod def setUpClass(cls): _skip_if_no('lxml') + def setup_data(self): + self.valid_data = os.path.join(DATA_PATH, 'valid_markup.html') + + def setup_flavor(self): + self.flavor = 'lxml' + + def setUp(self): + self.setup_data() + self.setup_flavor() + def read_html(self, *args, **kwargs): self.flavor = ['lxml'] kwargs['flavor'] = kwargs.get('flavor', self.flavor) @@ -630,6 +646,29 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_xpath_file_like(self): + with open(self.valid_data) as f: + dfs = self.read_html(f, + xpath="//table[@class='dataframe']") + + tm.assert_isinstance(dfs, list) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + @slow + def test_xpath_file_url(self): + url = self.valid_data + dfs = self.read_html(file_path_to_url(url), + xpath="//*[@class='dataframe']") + tm.assert_isinstance(dfs, list) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + def test_xpath_table_not_found(self): + with open(self.valid_data) as f: + with self.assertRaises(ValueError): + self.read_html(f, xpath="//div[@class='garbage']/table") + def test_invalid_flavor(): url = 'google.com' From de9451242d98b20cbf7e1f98e91d8f0c1f44d5ed Mon Sep 17 00:00:00 2001 From: phaebz Date: Sat, 2 Nov 2013 21:53:11 +0100 Subject: [PATCH 2/6] Coverage tests for `match` and `attr` parameters --- pandas/io/tests/test_html.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index bec0fc8588ede..6de03475815b1 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -646,6 +646,21 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_attrs_file_like(self): + with open(self.valid_data) as f: + dfs = self.read_html(f, + attrs={'class': 'dataframe'}) + + tm.assert_isinstance(dfs, list) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + def test_match_no_match(self): + with open(self.valid_data) as f: + with self.assertRaises(ValueError): + dfs = self.read_html(f, + match='supercalifragilistic') + def test_xpath_file_like(self): with open(self.valid_data) as f: dfs = self.read_html(f, From 9a300b45f10c51a9805f1d96039bcf10eff0d2c9 Mon Sep 17 00:00:00 2001 From: phaebz Date: Sun, 3 Nov 2013 13:24:51 +0100 Subject: [PATCH 3/6] XPath expression has to match table elements only --- pandas/io/html.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index a4e2af3b9f0c5..f205d55ef923a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -31,6 +31,7 @@ try: import lxml + from lxml.etree import XPathEvalError except ImportError: _HAS_LXML = False else: @@ -491,6 +492,9 @@ def _parse_tables(self, doc, match, kwargs, xpath): xpath_expr = xpath tables = doc.xpath(xpath_expr) + if not all(table.tag == 'table' for table in tables): + raise ValueError("XPath expression %s matched non-table elements" % xpath) + if not tables: raise ValueError("No tables found using XPath expression %s" % xpath) return tables From 563a95576d93baf959bc8feef17ee5d407d250a9 Mon Sep 17 00:00:00 2001 From: phaebz Date: Sun, 3 Nov 2013 13:25:23 +0100 Subject: [PATCH 4/6] Further testing for XPath feature --- pandas/io/html.py | 5 ++++- pandas/io/tests/test_html.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index f205d55ef923a..e0b093a6eedc3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -31,7 +31,6 @@ try: import lxml - from lxml.etree import XPathEvalError except ImportError: _HAS_LXML = False else: @@ -710,6 +709,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here + if xpath and not _HAS_LXML: + raise ValueError("XPath table selection needs the lxml module, " + "please install it.") + # hack around python 3 deleting the exception variable retained = None for flav in flavor: diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 6de03475815b1..8fced1fbbda0b 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -18,6 +18,11 @@ from numpy.random import rand from numpy.testing.decorators import slow +try: + from lxml.etree import XPathEvalError +except ImportError: + pass + from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import map, zip, StringIO, string_types @@ -679,11 +684,36 @@ def test_xpath_file_url(self): for df in dfs: tm.assert_isinstance(df, DataFrame) - def test_xpath_table_not_found(self): + def test_xpath_direct_ref(self): + with open(self.valid_data) as f: + dfs = self.read_html(f, + xpath="//html/body/table[@class='dataframe']" + "[last()]") + assert dfs[0].shape == (2, 3) + + def test_xpath_match_multiple(self): + with open(self.valid_data) as f: + dfs = self.read_html(f, + xpath="//*[@class='dataframe']") + + assert len(dfs) == 2 + + def test_xpath_match_none(self): with open(self.valid_data) as f: with self.assertRaises(ValueError): self.read_html(f, xpath="//div[@class='garbage']/table") + def test_xpath_not_all_tables(self): + with open(self.valid_data) as f: + with self.assertRaises(ValueError): + self.read_html(f, + xpath="//tr") + + def test_invalid_xpath(self): + with open(self.valid_data) as f: + with self.assertRaises(XPathEvalError): + self.read_html(f, xpath="//div[@@class=garbage]/table") + def test_invalid_flavor(): url = 'google.com' From 432269b86f4ab63c6b4abe3f29faf400f5dc80be Mon Sep 17 00:00:00 2001 From: phaebz Date: Thu, 7 Nov 2013 19:20:31 +0100 Subject: [PATCH 5/6] Correct format specifiers for XPath string formatting --- pandas/io/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index e0b093a6eedc3..c7e7210247c8d 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -492,10 +492,10 @@ def _parse_tables(self, doc, match, kwargs, xpath): tables = doc.xpath(xpath_expr) if not all(table.tag == 'table' for table in tables): - raise ValueError("XPath expression %s matched non-table elements" % xpath) + raise ValueError("XPath expression %r matched non-table elements" % xpath) if not tables: - raise ValueError("No tables found using XPath expression %s" % xpath) + raise ValueError("No tables found using XPath expression %r" % xpath) return tables else: From c2bcbc90a957e17a982bfd55297934c55075b905 Mon Sep 17 00:00:00 2001 From: phaebz Date: Thu, 7 Nov 2013 19:23:56 +0100 Subject: [PATCH 6/6] Release notes addition --- doc/source/release.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index 6e10bd651d90a..c779b25b30444 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -52,6 +52,8 @@ pandas 0.13.0 New features ~~~~~~~~~~~~ + - ``read_html()`` now accepts an ``xpath`` string argument representing an + xpath expression used for selecting tables to be read (:issue:`5416`) - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set the bandwidth, and to gkde.evaluate() to specify the indicies at which it