diff --git a/doc/source/release.rst b/doc/source/release.rst
index 6e10bd651d90a..c779b25b30444 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -52,6 +52,8 @@ pandas 0.13.0
 New features
 ~~~~~~~~~~~~
 
+  - ``read_html()`` now accepts an ``xpath`` string argument representing an
+    XPath expression used to select the tables to be read (:issue:`5416`)
   - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and
     ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
     the bandwidth, and to gkde.evaluate() to specify the indicies at which it
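A quick usage sketch of the new keyword for context; the URL and the
``dataframe`` class are placeholders, and lxml must be installed, since the
``xpath`` path is lxml-only::

    import pandas as pd

    # Select tables by an explicit XPath expression instead of the
    # ``match`` text regex; URL and class name are hypothetical.
    dfs = pd.read_html('http://example.com/stats.html', flavor='lxml',
                       xpath="//table[@class='dataframe']")
    print(len(dfs), dfs[0].shape)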
diff --git a/pandas/io/html.py b/pandas/io/html.py
index f3cfa3a16807a..c7e7210247c8d 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -165,13 +165,15 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, xpath):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.xpath = xpath
 
     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs,
+                                    self.xpath)
         return (self._build_table(table) for table in tables)
 
     def _parse_raw_data(self, rows):
@@ -227,7 +229,7 @@ def _parse_td(self, obj):
         """
         raise NotImplementedError
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         """Return all tables from the parsed DOM.
 
         Parameters
@@ -242,6 +244,9 @@ def _parse_tables(self, doc, match, attrs):
             A dictionary of table attributes that can be used to disambiguate
             mutliple tables on a page.
 
+        xpath : str or None
+            An XPath expression used to select the tables to be returned.
+
         Raises
         ------
         ValueError
@@ -393,7 +398,7 @@ def _parse_tbody(self, table):
     def _parse_tfoot(self, table):
         return table.find_all('tfoot')
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
 
@@ -481,24 +486,36 @@ def _parse_tr(self, table):
         expr = './/tr[normalize-space()]'
         return table.xpath(expr)
 
-    def _parse_tables(self, doc, match, kwargs):
-        pattern = match.pattern
+    def _parse_tables(self, doc, match, kwargs, xpath):
+        if xpath:
+            tables = doc.xpath(xpath)
+
+            if not tables:
+                raise ValueError("No tables found using XPath expression %r"
+                                 % xpath)
+            if not all(table.tag == 'table' for table in tables):
+                raise ValueError("XPath expression %r matched non-table "
+                                 "elements" % xpath)
+            return tables
+        else:
+            pattern = match.pattern
 
-        # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table
-        query = '//table//*[re:test(text(), %r)]/ancestor::table'
-        xpath_expr = u(query) % pattern
+            # 1. check all descendants for the given pattern and only search
+            #    tables
+            # 2. go up the tree until we find a table
+            query = '//table//*[re:test(text(), %r)]/ancestor::table'
+            xpath_expr = u(query) % pattern
 
-        # if any table attributes were given build an xpath expression to
-        # search for them
-        if kwargs:
-            xpath_expr += _build_xpath_expr(kwargs)
+            # if any table attributes were given build an xpath expression to
+            # search for them
+            if kwargs:
+                xpath_expr += _build_xpath_expr(kwargs)
 
-        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+            tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
 
-        if not tables:
-            raise ValueError("No tables found matching regex %r" % pattern)
-        return tables
+            if not tables:
+                raise ValueError("No tables found matching regex %r" % pattern)
+            return tables
 
     def _build_doc(self):
         """
@@ -688,15 +705,22 @@ def _validate_flavor(flavor):
 
 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, xpath):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
+    if xpath and not _HAS_LXML:
+        raise ValueError("XPath table selection requires the lxml module; "
+                         "please install it")
+
     # hack around python 3 deleting the exception variable
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        if xpath and flav in ('bs4', 'html5lib'):
+            raise NotImplementedError("xpath only works with the lxml flavor")
+
+        p = parser(io, compiled_match, attrs, xpath)
 
         try:
             tables = p.parse_tables()
@@ -714,7 +738,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
 
 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', xpath=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -795,6 +819,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
 
+    xpath : str or None, optional
+        If not ``None``, select the set of tables to be read using an XPath
+        expression; takes precedence over ``match``. Defaults to ``None``.
+        Note: this functionality is not (yet) available with the Beautiful
+        Soup parser (``flavor='bs4'``).
+
     Returns
     -------
     dfs : list of DataFrames
@@ -840,4 +870,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, xpath)
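To make the two selection paths above concrete, here is a standalone lxml
sketch; the sample markup is made up, and ``_re_namespace`` mirrors the EXSLT
regular-expression namespace that ``pandas.io.html`` uses for the fallback
query::

    import lxml.html

    _re_namespace = {'re': 'http://exslt.org/regular-expressions'}

    doc = lxml.html.fromstring("""
        <html><body>
          <table class="dataframe"><tr><td>spam</td></tr></table>
          <table><tr><td>eggs</td></tr></table>
        </body></html>""")

    # Path taken when ``xpath`` is given: evaluate it directly, then
    # validate that only <table> elements came back.
    tables = doc.xpath("//table[@class='dataframe']")
    assert tables and all(t.tag == 'table' for t in tables)

    # Fallback path: find text nodes matching the regex, then walk up
    # to the enclosing table.
    query = "//table//*[re:test(text(), %r)]/ancestor::table" % 'eggs'
    tables = doc.xpath(query, namespaces=_re_namespace)
    assert len(tables) == 1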
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index c26048d4cf20b..8fced1fbbda0b 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -18,6 +18,11 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow
 
+try:
+    from lxml.etree import XPathEvalError
+except ImportError:
+    pass
+
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
@@ -581,12 +586,28 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])
 
+    def test_xpath_bs4_not_implemented(self):
+        with open(self.spam_data) as f:
+            with self.assertRaises(NotImplementedError):
+                self.read_html(f, flavor='bs4',
+                               xpath="//div[@class='garbage']/table")
+
 
 class TestReadHtmlLxml(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         _skip_if_no('lxml')
 
+    def setup_data(self):
+        self.valid_data = os.path.join(DATA_PATH, 'valid_markup.html')
+
+    def setup_flavor(self):
+        self.flavor = 'lxml'
+
+    def setUp(self):
+        self.setup_data()
+        self.setup_flavor()
+
     def read_html(self, *args, **kwargs):
         self.flavor = ['lxml']
         kwargs['flavor'] = kwargs.get('flavor', self.flavor)
@@ -630,6 +651,69 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])
 
+    def test_attrs_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, attrs={'class': 'dataframe'})
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_match_no_match(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, match='supercalifragilistic')
+
+    def test_xpath_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//table[@class='dataframe']")
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    @slow
+    def test_xpath_file_url(self):
+        url = self.valid_data
+        dfs = self.read_html(file_path_to_url(url),
+                             xpath="//*[@class='dataframe']")
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_xpath_direct_ref(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//html/body/table[@class='dataframe']"
+                                       "[last()]")
+        assert dfs[0].shape == (2, 3)
+
+    def test_xpath_match_multiple(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//*[@class='dataframe']")
+
+        assert len(dfs) == 2
+
+    def test_xpath_match_none(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//div[@class='garbage']/table")
+
+    def test_xpath_not_all_tables(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//tr")
+
+    def test_invalid_xpath(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(XPathEvalError):
+                self.read_html(f, xpath="//div[@@class=garbage]/table")
+
 
 def test_invalid_flavor():
     url = 'google.com'
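The tests above pin down the error contract; a short sketch of what callers
should expect (the file name is hypothetical)::

    import pandas as pd

    # ``xpath`` is lxml-only, so combining it with the BeautifulSoup
    # flavor raises NotImplementedError before any parsing happens.
    try:
        pd.read_html('page.html', flavor='bs4', xpath='//table')
    except NotImplementedError:
        pass

    # An expression that matches nothing, or one that matches non-table
    # elements such as //tr, raises ValueError instead.
    try:
        pd.read_html('page.html', flavor='lxml', xpath='//tr')
    except ValueError as exc:
        print(exc)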