From 1c56365df9cdf47181205d07d1c381716c696d30 Mon Sep 17 00:00:00 2001 From: palewire Date: Tue, 14 Aug 2018 16:21:31 -0700 Subject: [PATCH 1/4] skip_rows substituted for skiprows --- pandas/_libs/parsers.pyx | 18 +++--- pandas/io/excel.py | 18 +++--- pandas/io/html.py | 32 ++++----- pandas/io/parsers.py | 68 ++++++++++---------- pandas/tests/io/parser/c_parser_only.py | 2 +- pandas/tests/io/parser/comment.py | 6 +- pandas/tests/io/parser/common.py | 22 +++---- pandas/tests/io/parser/header.py | 12 ++-- pandas/tests/io/parser/multithread.py | 2 +- pandas/tests/io/parser/na_values.py | 6 +- pandas/tests/io/parser/parse_dates.py | 2 +- pandas/tests/io/parser/python_parser_only.py | 4 +- pandas/tests/io/parser/skiprows.py | 44 ++++++------- pandas/tests/io/parser/test_read_fwf.py | 14 ++-- pandas/tests/io/parser/test_textreader.py | 2 +- pandas/tests/io/test_excel.py | 28 ++++---- pandas/tests/io/test_html.py | 42 ++++++------ 17 files changed, 161 insertions(+), 161 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fba7f210b34a1..c4ed4876b8bf5 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -294,7 +294,7 @@ cdef class TextReader: object header, orig_header, names, header_start, header_end object index_col object low_memory - object skiprows + object skip_rows object dtype object encoding object compression @@ -348,7 +348,7 @@ cdef class TextReader: false_values=None, allow_leading_cols=True, low_memory=False, - skiprows=None, + skip_rows=None, skipfooter=0, verbose=False, mangle_dupe_cols=True, @@ -436,8 +436,8 @@ cdef class TextReader: self.parser.error_bad_lines = int(error_bad_lines) self.parser.warn_bad_lines = int(warn_bad_lines) - self.skiprows = skiprows - if skiprows is not None: + self.skip_rows = skip_rows + if skip_rows is not None: self._make_skiprow_set() self.skipfooter = skipfooter @@ -605,13 +605,13 @@ cdef class TextReader: self.parser.quotechar = ord(quote_char) cdef _make_skiprow_set(self): - if isinstance(self.skiprows, (int, np.integer)): - parser_set_skipfirstnrows(self.parser, self.skiprows) - elif not callable(self.skiprows): - for i in self.skiprows: + if isinstance(self.skip_rows, (int, np.integer)): + parser_set_skipfirstnrows(self.parser, self.skip_rows) + elif not callable(self.skip_rows): + for i in self.skip_rows: parser_add_skiprow(self.parser, i) else: - self.parser.skipfunc = self.skiprows + self.parser.skipfunc = self.skip_rows cdef _setup_parser_source(self, source): cdef: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index e2db6643c5ef0..01ea4a139915f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -130,7 +130,7 @@ .. 
versionadded:: 0.19.0 -skiprows : list-like +skip_rows : list-like Rows to skip at the beginning (0-indexed) nrows : int, default None Number of rows to parse @@ -295,7 +295,7 @@ def read_excel(io, converters=None, true_values=None, false_values=None, - skiprows=None, + skip_rows=None, nrows=None, na_values=None, parse_dates=False, @@ -330,7 +330,7 @@ def read_excel(io, converters=converters, true_values=true_values, false_values=false_values, - skiprows=skiprows, + skip_rows=skip_rows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, @@ -422,7 +422,7 @@ def parse(self, converters=None, true_values=None, false_values=None, - skiprows=None, + skip_rows=None, nrows=None, na_values=None, parse_dates=False, @@ -457,7 +457,7 @@ def parse(self, converters=converters, true_values=true_values, false_values=false_values, - skiprows=skiprows, + skip_rows=skip_rows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, @@ -511,7 +511,7 @@ def _parse_excel(self, dtype=None, true_values=None, false_values=None, - skiprows=None, + skip_rows=None, nrows=None, na_values=None, verbose=False, @@ -649,8 +649,8 @@ def _parse_cell(cell_contents, cell_typ): header_names = [] control_row = [True] * len(data[0]) for row in header: - if is_integer(skiprows): - row += skiprows + if is_integer(skip_rows): + row += skip_rows data[row], control_row = _fill_mi_header( data[row], control_row) @@ -687,7 +687,7 @@ def _parse_cell(cell_contents, cell_typ): dtype=dtype, true_values=true_values, false_values=false_values, - skiprows=skiprows, + skip_rows=skip_rows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, diff --git a/pandas/io/html.py b/pandas/io/html.py index cca27db00f48d..0b8dd0aa5b31a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -85,32 +85,32 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): return regex.sub(' ', s.strip()) -def _get_skiprows(skiprows): +def _get_skiprows(skip_rows): """Get an iterator given an integer, slice or container. Parameters ---------- - skiprows : int, slice, container + skip_rows : int, slice, container The iterator to use to skip rows; can also be a slice. Raises ------ TypeError - * If `skiprows` is not a slice, integer, or Container + * If `skip_rows` is not a slice, integer, or Container Returns ------- it : iterable A proper iterator to use to skip rows of a DataFrame. 
""" - if isinstance(skiprows, slice): - return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): - return skiprows - elif skiprows is None: + if isinstance(skip_rows, slice): + return lrange(skip_rows.start or 0, skip_rows.stop, skip_rows.step or 1) + elif isinstance(skip_rows, numbers.Integral) or is_list_like(skip_rows): + return skip_rows + elif skip_rows is None: return 0 raise TypeError('%r is not a valid type for skipping rows' % - type(skiprows).__name__) + type(skip_rows).__name__) def _read(obj): @@ -779,7 +779,7 @@ def _expand_elements(body): def _data_to_frame(**kwargs): head, body, foot = kwargs.pop('data') header = kwargs.pop('header') - kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) + kwargs['skip_rows'] = _get_skiprows(kwargs['skip_rows']) if head: body = head + body @@ -922,7 +922,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, attrs=None, parse_dates=False, + skip_rows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True): @@ -956,7 +956,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, index_col : int or list-like or None, optional The column (or list of columns) to use to create the index. - skiprows : int or list-like or slice or None, optional + skip_rows : int or list-like or slice or None, optional 0-based. Number of rows to skip after parsing the column integer. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth @@ -1060,7 +1060,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied - **after** `skiprows` is applied. + **after** `skip_rows` is applied. This function will *always* return a list of :class:`DataFrame` *or* it will fail, e.g., it will *not* return an empty list. @@ -1077,13 +1077,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, _importers() # Type check here. We don't want to parse only to fail because of an - # invalid value of an integer skiprows. - if isinstance(skiprows, numbers.Integral) and skiprows < 0: + # invalid value of an integer skip_rows. + if isinstance(skip_rows, numbers.Integral) and skip_rows < 0: raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') _validate_header_arg(header) return _parse(flavor=flavor, io=io, match=match, header=header, - index_col=index_col, skiprows=skiprows, + index_col=index_col, skip_rows=skip_rows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4b3fa08e5e4af..b7bd298834243 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -144,7 +144,7 @@ Values to consider as False skipinitialspace : boolean, default False Skip spaces after delimiter. -skiprows : list-like or integer or callable, default None +skip_rows : list-like or integer or callable, default None Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. 
@@ -264,7 +264,7 @@ of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), fully commented lines are ignored by the parameter `header` but not by - `skiprows`. For example, if ``comment='#'``, parsing + `skip_rows`. For example, if ``comment='#'``, parsing ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being treated as the header. encoding : str, default None @@ -347,7 +347,7 @@ fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try detecting the column specifications from the first 100 rows of - the data which are not being skipped via skiprows (default='infer'). + the data which are not being skipped via skip_rows (default='infer'). widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. @@ -479,7 +479,7 @@ def _read(filepath_or_buffer, kwds): 'index_col': None, 'names': None, 'prefix': None, - 'skiprows': None, + 'skip_rows': None, 'na_values': None, 'true_values': None, 'false_values': None, @@ -572,7 +572,7 @@ def parser_f(filepath_or_buffer, true_values=None, false_values=None, skipinitialspace=False, - skiprows=None, + skip_rows=None, nrows=None, # NA and Missing Data Handling @@ -664,7 +664,7 @@ def parser_f(filepath_or_buffer, index_col=index_col, names=names, prefix=prefix, - skiprows=skiprows, + skip_rows=skip_rows, na_values=na_values, true_values=true_values, false_values=false_values, @@ -960,7 +960,7 @@ def _clean_options(self, options, engine): names = options['names'] converters = options['converters'] na_values = options['na_values'] - skiprows = options['skiprows'] + skip_rows = options['skip_rows'] _validate_header_arg(options['header']) @@ -1009,22 +1009,22 @@ def _clean_options(self, options, engine): keep_default_na = options['keep_default_na'] na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) - # handle skiprows; this is internally handled by the + # handle skip_rows; this is internally handled by the # c-engine, so only need for python parsers if engine != 'c': - if is_integer(skiprows): - skiprows = lrange(skiprows) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if is_integer(skip_rows): + skip_rows = lrange(skip_rows) + if skip_rows is None: + skip_rows = set() + elif not callable(skip_rows): + skip_rows = set(skip_rows) # put stuff back result['names'] = names result['converters'] = converters result['na_values'] = na_values result['na_fvalues'] = na_fvalues - result['skiprows'] = skiprows + result['skip_rows'] = skip_rows return result, engine @@ -2007,7 +2007,7 @@ def TextParser(*args, **kwds): parse_dates : boolean, default False keep_date_col : boolean, default False date_parser : function, default None - skiprows : list of integers + skip_rows : list of integers Row numbers to skip skipfooter : int Number of line at bottom of file to skip @@ -2056,12 +2056,12 @@ def __init__(self, f, **kwds): self.encoding = kwds['encoding'] self.compression = kwds['compression'] self.memory_map = kwds['memory_map'] - self.skiprows = kwds['skiprows'] + self.skip_rows = kwds['skip_rows'] - if callable(self.skiprows): - self.skipfunc = self.skiprows + if callable(self.skip_rows): + self.skipfunc = self.skip_rows else: - self.skipfunc = lambda x: x in self.skiprows + self.skipfunc = lambda x: x in self.skip_rows self.skipfooter = 
_validate_skipfooter_arg(kwds['skipfooter']) self.delimiter = kwds['delimiter'] @@ -2974,8 +2974,8 @@ def _get_lines(self, rows=None): new_rows = self.data[self.pos:self.pos + rows] new_pos = self.pos + rows - # Check for stop rows. n.b.: self.skiprows is a set. - if self.skiprows: + # Check for stop rows. n.b.: self.skip_rows is a set. + if self.skip_rows: new_rows = [row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)] @@ -3001,7 +3001,7 @@ def _get_lines(self, rows=None): new_rows.append(new_row) except StopIteration: - if self.skiprows: + if self.skip_rows: new_rows = [row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)] lines.extend(new_rows) @@ -3365,13 +3365,13 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. """ - def __init__(self, f, colspecs, delimiter, comment, skiprows=None): + def __init__(self, f, colspecs, delimiter, comment, skip_rows=None): self.f = f self.buffer = None self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' self.comment = comment if colspecs == 'infer': - self.colspecs = self.detect_colspecs(skiprows=skiprows) + self.colspecs = self.detect_colspecs(skip_rows=skip_rows) else: self.colspecs = colspecs @@ -3387,14 +3387,14 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') - def get_rows(self, n, skiprows=None): + def get_rows(self, n, skip_rows=None): """ Read rows from self.f, skipping as specified. We distinguish buffer_rows (the first <= n lines) from the rows returned to detect_colspecs because it's simpler to leave the other locations with - skiprows logic alone than to modify them to deal + skip_rows logic alone than to modify them to deal with the fact we skipped some rows here as well. Parameters @@ -3402,7 +3402,7 @@ def get_rows(self, n, skiprows=None): n : int Number of rows to read from self.f, not counting rows that are skipped. - skiprows: set, optional + skip_rows: set, optional Indices of rows to skip. Returns @@ -3411,12 +3411,12 @@ def get_rows(self, n, skiprows=None): A list containing the rows to read. 
""" - if skiprows is None: - skiprows = set() + if skip_rows is None: + skip_rows = set() buffer_rows = [] detect_rows = [] for i, row in enumerate(self.f): - if i not in skiprows: + if i not in skip_rows: detect_rows.append(row) buffer_rows.append(row) if len(detect_rows) >= n: @@ -3424,11 +3424,11 @@ def get_rows(self, n, skiprows=None): self.buffer = iter(buffer_rows) return detect_rows - def detect_colspecs(self, n=100, skiprows=None): + def detect_colspecs(self, n=100, skip_rows=None): # Regex escape the delimiters delimiters = ''.join(r'\%s' % x for x in self.delimiter) pattern = re.compile('([^%s]+)' % delimiters) - rows = self.get_rows(n, skiprows) + rows = self.get_rows(n, skip_rows) if not rows: raise EmptyDataError("No rows from which to infer column width") max_len = max(map(len, rows)) @@ -3471,4 +3471,4 @@ def __init__(self, f, **kwds): def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment, self.skiprows) + self.comment, self.skip_rows) diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 9dc7b070f889d..a0caa92dd3ca8 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -423,7 +423,7 @@ def test_comment_whitespace_delimited(self): 9 2 3 # skipped line # comment""" df = self.read_csv(StringIO(test_input), comment='#', header=None, - delimiter='\\s+', skiprows=0, + delimiter='\\s+', skip_rows=0, error_bad_lines=False) error = sys.stderr.getvalue() # skipped lines 2, 3, 4, 9 diff --git a/pandas/tests/io/parser/comment.py b/pandas/tests/io/parser/comment.py index 9987a017cf985..4d2e793d082de 100644 --- a/pandas/tests/io/parser/comment.py +++ b/pandas/tests/io/parser/comment.py @@ -65,7 +65,7 @@ def test_comment_skiprows(self): """ # this should ignore the first four lines (including comments) expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#', skiprows=4) + df = self.read_csv(StringIO(data), comment='#', skip_rows=4) tm.assert_numpy_array_equal(df.values, expected) def test_comment_header(self): @@ -91,11 +91,11 @@ def test_comment_skiprows_header(self): 1,2.,4. 
5.,NaN,10.0 """ - # skiprows should skip the first 4 lines (including comments), while + # skip_rows should skip the first 4 lines (including comments), while # header should start from the second non-commented line starting # with line 5 expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + df = self.read_csv(StringIO(data), comment='#', skip_rows=4, header=1) tm.assert_numpy_array_equal(df.values, expected) def test_custom_comment_char(self): diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 9e871d27f0ce8..38625763522bd 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -146,7 +146,7 @@ def test_malformed(self): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) + skip_rows=[2]) it.read(5) # middle chunk @@ -162,7 +162,7 @@ def test_malformed(self): with tm.assert_raises_regex(Exception, msg): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) + skip_rows=[2]) it.read(3) # last chunk @@ -178,7 +178,7 @@ def test_malformed(self): with tm.assert_raises_regex(Exception, msg): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) + skip_rows=[2]) it.read() # skipfooter is not supported with the C parser yet @@ -507,8 +507,8 @@ def test_iterator(self): tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) - # pass skiprows - parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + # pass skip_rows + parser = TextParser(lines, index_col=0, chunksize=2, skip_rows=[1]) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[1:3]) @@ -745,9 +745,9 @@ def test_utf16_bom_skiprows(self): from io import TextIOWrapper s = TextIOWrapper(s, encoding='utf-8') - result = self.read_csv(path, encoding=enc, skiprows=2, + result = self.read_csv(path, encoding=enc, skip_rows=2, sep=sep) - expected = self.read_csv(s, encoding='utf-8', skiprows=2, + expected = self.read_csv(s, encoding='utf-8', skip_rows=2, sep=sep) s.close() @@ -1041,7 +1041,7 @@ def test_eof_states(self): # SKIP_LINE data = 'a,b,c\n4,5,6\nskipme' - result = self.read_csv(StringIO(data), skiprows=[2]) + result = self.read_csv(StringIO(data), skip_rows=[2]) tm.assert_frame_equal(result, expected) # With skip_blank_lines = False @@ -1144,11 +1144,11 @@ def test_trailing_spaces(self): # lines with trailing whitespace and blank lines df = self.read_csv(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) + skip_rows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], + skip_rows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) @@ -1157,7 +1157,7 @@ def test_trailing_spaces(self): "C": [4., 10]}) df = self.read_table(StringIO(data.replace(',', ' ')), delim_whitespace=True, - skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) + skip_rows=[1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) def test_raise_on_sep_with_delim_whitespace(self): diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py index ad3d4592bd599..13a36edd62797 100644 --- a/pandas/tests/io/parser/header.py +++ 
b/pandas/tests/io/parser/header.py @@ -152,7 +152,7 @@ def test_header_multiindex_common_format(self): tm.assert_frame_equal(df, result) # to_csv, tuples - result = self.read_csv(StringIO(data), skiprows=3, + result = self.read_csv(StringIO(data), skip_rows=3, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=0) @@ -161,7 +161,7 @@ def test_header_multiindex_common_format(self): # to_csv, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( - StringIO(data), skiprows=3, index_col=0, + StringIO(data), skip_rows=3, index_col=0, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) @@ -177,7 +177,7 @@ def test_header_multiindex_common_format(self): tm.assert_frame_equal(df, result) # common, tuples - result = self.read_csv(StringIO(data), skiprows=2, + result = self.read_csv(StringIO(data), skip_rows=2, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=0) @@ -186,7 +186,7 @@ def test_header_multiindex_common_format(self): # common, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( - StringIO(data), skiprows=2, index_col=0, + StringIO(data), skip_rows=2, index_col=0, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) @@ -202,7 +202,7 @@ def test_header_multiindex_common_format(self): tm.assert_frame_equal(df.reset_index(drop=True), result) # common, no index_col, tuples - result = self.read_csv(StringIO(data), skiprows=2, + result = self.read_csv(StringIO(data), skip_rows=2, names=[('a', 'q'), ('a', 'r'), ('a', 's'), ('b', 't'), ('c', 'u'), ('c', 'v')], index_col=None) @@ -211,7 +211,7 @@ def test_header_multiindex_common_format(self): # common, no index_col, namedtuples TestTuple = namedtuple('names', ['first', 'second']) result = self.read_csv( - StringIO(data), skiprows=2, index_col=None, + StringIO(data), skip_rows=2, index_col=None, names=[TestTuple('a', 'q'), TestTuple('a', 'r'), TestTuple('a', 's'), TestTuple('b', 't'), TestTuple('c', 'u'), TestTuple('c', 'v')]) diff --git a/pandas/tests/io/parser/multithread.py b/pandas/tests/io/parser/multithread.py index 2aaef889db6de..0fb176c09fee9 100644 --- a/pandas/tests/io/parser/multithread.py +++ b/pandas/tests/io/parser/multithread.py @@ -43,7 +43,7 @@ def reader(arg): return self.read_csv(path, index_col=0, header=None, - skiprows=int(start) + 1, + skip_rows=int(start) + 1, nrows=nrows, parse_dates=[9]) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 880ab707cfd07..142b6c8dd5078 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -104,15 +104,15 @@ def test_custom_na_values(self): [nan, 5, nan], [7, 8, nan]]) - df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) + df = self.read_csv(StringIO(data), na_values=['baz'], skip_rows=[1]) tm.assert_numpy_array_equal(df.values, expected) df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], - skiprows=[1]) + skip_rows=[1]) tm.assert_numpy_array_equal(df2.values, expected) df3 = self.read_table(StringIO(data), sep=',', na_values='baz', - skiprows=[1]) + skip_rows=[1]) tm.assert_numpy_array_equal(df3.values, expected) def test_bool_na_values(self): diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 1bf055854de88..8fa2baeba2275 
100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -349,7 +349,7 @@ def test_parse_dates_custom_euroformat(self): parser = lambda d: parse_date(d, day_first=True) pytest.raises(TypeError, self.read_csv, - StringIO(text), skiprows=[0], + StringIO(text), skip_rows=[0], names=['time', 'Q', 'NTU'], index_col=0, parse_dates=True, date_parser=parser, na_values=['NA']) diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index c0616ebbab4a5..6f9d3031fcba0 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -68,7 +68,7 @@ def test_sniff_delimiter(self): baz|7|8|9 """ data3 = self.read_csv(StringIO(text), index_col=0, - sep=None, skiprows=2) + sep=None, skip_rows=2) tm.assert_frame_equal(data, data3) text = u("""ignore this @@ -85,7 +85,7 @@ def test_sniff_delimiter(self): from io import TextIOWrapper s = TextIOWrapper(s, encoding='utf-8') - data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2, + data4 = self.read_csv(s, index_col=0, sep=None, skip_rows=2, encoding='utf-8') tm.assert_frame_equal(data, data4) diff --git a/pandas/tests/io/parser/skiprows.py b/pandas/tests/io/parser/skiprows.py index fb08ec0447267..7a64e0c272958 100644 --- a/pandas/tests/io/parser/skiprows.py +++ b/pandas/tests/io/parser/skiprows.py @@ -30,10 +30,10 @@ def test_skiprows_bug(self): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None, + data = self.read_csv(StringIO(text), skip_rows=lrange(6), header=None, index_col=0, parse_dates=True) - data2 = self.read_csv(StringIO(text), skiprows=6, header=None, + data2 = self.read_csv(StringIO(text), skip_rows=6, header=None, index_col=0, parse_dates=True) expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), @@ -52,7 +52,7 @@ def test_deep_skiprows(self): condensed_text = "a,b,c\n" + \ "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]) - data = self.read_csv(StringIO(text), skiprows=[6, 8]) + data = self.read_csv(StringIO(text), skip_rows=[6, 8]) condensed_data = self.read_csv(StringIO(condensed_text)) tm.assert_frame_equal(data, condensed_data) @@ -68,7 +68,7 @@ def test_skiprows_blank(self): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - data = self.read_csv(StringIO(text), skiprows=6, header=None, + data = self.read_csv(StringIO(text), skip_rows=6, header=None, index_col=0, parse_dates=True) expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), @@ -90,7 +90,7 @@ def test_skiprow_with_newline(self): [3, 'line 31', 1]] expected = DataFrame(expected, columns=[ 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) + df = self.read_csv(StringIO(data), skip_rows=[1]) tm.assert_frame_equal(df, expected) data = ('a,b,c\n~a\n b~,~e\n d~,' @@ -100,7 +100,7 @@ def test_skiprow_with_newline(self): 'a', 'b', 'c']) df = self.read_csv(StringIO(data), quotechar="~", - skiprows=[2]) + skip_rows=[2]) tm.assert_frame_equal(df, expected) data = ('Text,url\n~example\n ' @@ -112,7 +112,7 @@ def test_skiprow_with_newline(self): 'Text', 'url']) df = self.read_csv(StringIO(data), quotechar="~", - skiprows=[1, 3]) + skip_rows=[1, 3]) tm.assert_frame_equal(df, expected) def test_skiprow_with_quote(self): @@ -125,7 +125,7 @@ def test_skiprow_with_quote(self): [3, "line '31' line 32", 1]] expected = DataFrame(expected, columns=[ 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) + df = 
self.read_csv(StringIO(data), skip_rows=[1]) tm.assert_frame_equal(df, expected) def test_skiprow_with_newline_and_quote(self): @@ -138,7 +138,7 @@ def test_skiprow_with_newline_and_quote(self): [3, "line \n'31' line 32", 1]] expected = DataFrame(expected, columns=[ 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) + df = self.read_csv(StringIO(data), skip_rows=[1]) tm.assert_frame_equal(df, expected) data = """id,text,num_lines @@ -149,7 +149,7 @@ def test_skiprow_with_newline_and_quote(self): [3, "line '31\n' line 32", 1]] expected = DataFrame(expected, columns=[ 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) + df = self.read_csv(StringIO(data), skip_rows=[1]) tm.assert_frame_equal(df, expected) data = """id,text,num_lines @@ -160,7 +160,7 @@ def test_skiprow_with_newline_and_quote(self): [3, "line '31\n' \r\tline 32", 1]] expected = DataFrame(expected, columns=[ 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) + df = self.read_csv(StringIO(data), skip_rows=[1]) tm.assert_frame_equal(df, expected) def test_skiprows_lineterminator(self): @@ -176,19 +176,19 @@ def test_skiprows_lineterminator(self): 'oflag']) # test with default line terminators "LF" and "CRLF" - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + df = self.read_csv(StringIO(data), skip_rows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, + skip_rows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) # "CR" is not respected with the Python parser yet if self.engine == 'c': df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, + skip_rows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) @@ -197,29 +197,29 @@ def test_skiprows_infield_quote(self): data = 'a"\nb"\na\n1' expected = DataFrame({'a': [1]}) - df = self.read_csv(StringIO(data), skiprows=2) + df = self.read_csv(StringIO(data), skip_rows=2) tm.assert_frame_equal(df, expected) def test_skiprows_callable(self): data = 'a\n1\n2\n3\n4\n5' - skiprows = lambda x: x % 2 == 0 + skip_rows = lambda x: x % 2 == 0 expected = DataFrame({'1': [3, 5]}) - df = self.read_csv(StringIO(data), skiprows=skiprows) + df = self.read_csv(StringIO(data), skip_rows=skip_rows) tm.assert_frame_equal(df, expected) expected = DataFrame({'foo': [3, 5]}) - df = self.read_csv(StringIO(data), skiprows=skiprows, + df = self.read_csv(StringIO(data), skip_rows=skip_rows, header=0, names=['foo']) tm.assert_frame_equal(df, expected) - skiprows = lambda x: True + skip_rows = lambda x: True msg = "No columns to parse from file" with tm.assert_raises_regex(EmptyDataError, msg): - self.read_csv(StringIO(data), skiprows=skiprows) + self.read_csv(StringIO(data), skip_rows=skip_rows) # This is a bad callable and should raise. 
msg = "by zero" - skiprows = lambda x: 1 / 0 + skip_rows = lambda x: 1 / 0 with tm.assert_raises_regex(ZeroDivisionError, msg): - self.read_csv(StringIO(data), skiprows=skiprows) + self.read_csv(StringIO(data), skip_rows=skip_rows) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index a60f2b5a4c946..8d07613ca695c 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -377,10 +377,10 @@ def test_skiprows_inference(self): 0.0 1.0 101.6 956.1 """.strip() - expected = read_csv(StringIO(test), skiprows=2, + expected = read_csv(StringIO(test), skip_rows=2, delim_whitespace=True) tm.assert_frame_equal(expected, read_fwf( - StringIO(test), skiprows=2)) + StringIO(test), skip_rows=2)) def test_skiprows_by_index_inference(self): test = """ @@ -391,10 +391,10 @@ def test_skiprows_by_index_inference(self): 456 78 9 456 """.strip() - expected = read_csv(StringIO(test), skiprows=[0, 2], + expected = read_csv(StringIO(test), skip_rows=[0, 2], delim_whitespace=True) tm.assert_frame_equal(expected, read_fwf( - StringIO(test), skiprows=[0, 2])) + StringIO(test), skip_rows=[0, 2])) def test_skiprows_inference_empty(self): test = """ @@ -404,7 +404,7 @@ def test_skiprows_inference_empty(self): """.strip() with pytest.raises(EmptyDataError): - read_fwf(StringIO(test), skiprows=3) + read_fwf(StringIO(test), skip_rows=3) def test_whitespace_preservation(self): # Addresses Issue #16772 @@ -417,7 +417,7 @@ def test_whitespace_preservation(self): a bbb ccdd """ result = read_fwf(StringIO(test_data), widths=[3, 3], - header=None, skiprows=[0], delimiter="\n\t") + header=None, skip_rows=[0], delimiter="\n\t") tm.assert_frame_equal(result, expected) @@ -431,6 +431,6 @@ def test_default_delimiter(self): a \tbbb cc\tdd """ result = read_fwf(StringIO(test_data), widths=[3, 3], - header=None, skiprows=[0]) + header=None, skip_rows=[0]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index c7026e3e0fc88..fccb240c3f8e5 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -329,7 +329,7 @@ def test_empty_field_eof(self): index=[0, 5, 7, 12]) for _ in range(100): - df = read_csv(StringIO('a,b\nc\n'), skiprows=0, + df = read_csv(StringIO('a,b\nc\n'), skip_rows=0, names=['a'], engine='c') assert_frame_equal(df, a) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index fa5a8f6a1900c..fe73c85f8ebe4 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -110,11 +110,11 @@ def test_usecols_int(self, ext): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['A', 'B', 'C']) df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols=3) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df2 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, usecols=3) with tm.assert_produces_warning(FutureWarning): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, parse_cols=3) # TODO add index to xls file) @@ -128,11 +128,11 @@ def test_usecols_list(self, ext): dfref = dfref.reindex(columns=['B', 'C']) df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols=[0, 2, 3]) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df2 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, 
usecols=[0, 2, 3]) with tm.assert_produces_warning(FutureWarning): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, parse_cols=[0, 2, 3]) # TODO add index to xls file) @@ -147,11 +147,11 @@ def test_usecols_str(self, ext): df1 = dfref.reindex(columns=['A', 'B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, usecols='A:D') with tm.assert_produces_warning(FutureWarning): - df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df4 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, parse_cols='A:D') # TODO add index to xls, read xls ignores index name ? @@ -162,7 +162,7 @@ def test_usecols_str(self, ext): df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A,C,D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, usecols='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) @@ -171,7 +171,7 @@ def test_usecols_str(self, ext): df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A,C:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0, usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) @@ -235,12 +235,12 @@ def test_excel_table_sheet_by_index(self, ext): dfref = self.get_csv_refdf('test1') df1 = read_excel(excel, 0, index_col=0) - df2 = read_excel(excel, 1, skiprows=[1], index_col=0) + df2 = read_excel(excel, 1, skip_rows=[1], index_col=0) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) df1 = excel.parse(0, index_col=0) - df2 = excel.parse(1, skiprows=[1], index_col=0) + df2 = excel.parse(1, skip_rows=[1], index_col=0) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) @@ -263,7 +263,7 @@ def test_excel_table(self, ext): dfref = self.get_csv_refdf('test1') df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + df2 = self.get_exceldf('test1', ext, 'Sheet2', skip_rows=[1], index_col=0) # TODO add index to file tm.assert_frame_equal(df1, dfref, check_names=False) @@ -773,7 +773,7 @@ def test_read_excel_multiindex(self, ext): tm.assert_frame_equal(actual, expected) actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0, 1], - header=[0, 1], skiprows=2) + header=[0, 1], skip_rows=2) tm.assert_frame_equal(actual, expected) @td.skip_if_no('xlsxwriter') @@ -968,7 +968,7 @@ def test_read_excel_skiprows_list(self, ext): # GH 4903 actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + ext), - 'skiprows_list', skiprows=[0, 2]) + 'skiprows_list', skip_rows=[0, 2]) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], [3, 4.5, pd.Timestamp('2015-01-03'), False], @@ -978,7 +978,7 @@ def test_read_excel_skiprows_list(self, ext): actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + ext), - 'skiprows_list', skiprows=np.array([0, 2])) + 'skiprows_list', 
skip_rows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows(self, ext): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index e08899a03d2d7..9a92db5449112 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -153,57 +153,57 @@ def test_spam_header(self): assert not df.empty def test_skiprows_int(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=1) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=1) assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] - df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=range(2))[0] + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=range(2))[0] tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=[1, 2]) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows={1, 2}) - df2 = self.read_html(self.spam_data, 'Unit', skiprows={2, 1}) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows={1, 2}) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows={2, 1}) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=1) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=slice(2)) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) + df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=slice(2, 5)) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): df1 = self.read_html(self.spam_data, '.*Water.*', - skiprows=np.arange(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) + skip_rows=np.arange(2)) + df2 = self.read_html(self.spam_data, 'Unit', skip_rows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): with tm.assert_raises_regex(TypeError, 'is not a valid type ' 'for skipping rows'): - self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') + self.read_html(self.spam_data, '.*Water.*', skip_rows='asdf') def test_index(self): df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) @@ -312,18 +312,18 @@ def test_multiindex_header_index(self): @pytest.mark.slow def test_multiindex_header_skiprows_tuples(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df = 
self._bank_data(header=[0, 1], skiprows=1, + df = self._bank_data(header=[0, 1], skip_rows=1, tupleize_cols=True)[0] assert isinstance(df.columns, Index) @pytest.mark.slow def test_multiindex_header_skiprows(self): - df = self._bank_data(header=[0, 1], skiprows=1)[0] + df = self._bank_data(header=[0, 1], skip_rows=1)[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_header_index_skiprows(self): - df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] + df = self._bank_data(header=[0, 1], index_col=[0, 1], skip_rows=1)[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @@ -340,7 +340,7 @@ def test_regex_idempotency(self): def test_negative_skiprows(self): with tm.assert_raises_regex(ValueError, r'\(you passed a negative value\)'): - self.read_html(self.spam_data, 'Water', skiprows=-1) + self.read_html(self.spam_data, 'Water', skip_rows=-1) @network def test_multiple_matches(self): From 1cf20a84c8688f99143ad8b6420c092bcac4bd72 Mon Sep 17 00:00:00 2001 From: palewire Date: Tue, 14 Aug 2018 18:24:14 -0700 Subject: [PATCH 2/4] Updated skiprows to skip_rows in relevant docs --- asv_bench/benchmarks/io/csv.py | 8 ++++---- doc/source/cookbook.rst | 2 +- doc/source/io.rst | 20 ++++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 2d4bdc7ae812a..abdd6fc438587 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -87,9 +87,9 @@ class ReadCSVSkipRows(BaseIO): goal_time = 0.2 fname = '__test__.csv' params = [None, 10000] - param_names = ['skiprows'] + param_names = ['skip_rows'] - def setup(self, skiprows): + def setup(self, skip_rows): N = 20000 index = tm.makeStringIndex(N) df = DataFrame({'float1': np.random.randn(N), @@ -100,8 +100,8 @@ def setup(self, skiprows): index=index) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skip_rows): + read_csv(self.fname, skip_rows=skip_rows) class ReadUint64Integers(StringIORewind): diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index f6fa9e9f86143..a4576060fec2b 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1034,7 +1034,7 @@ Option 1: pass rows explicitly to skip rows .. ipython:: python - pd.read_csv(StringIO(data), sep=';', skiprows=[11,12], + pd.read_csv(StringIO(data), sep=';', skip_rows=[11,12], index_col=0, parse_dates=True, header=10) Option 2: read column names and then data diff --git a/doc/source/io.rst b/doc/source/io.rst index c2c8c1c17700f..d9888879e2317 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -186,7 +186,7 @@ false_values : list, default ``None`` Values to consider as ``False``. skipinitialspace : boolean, default ``False`` Skip spaces after delimiter. -skiprows : list-like or integer, default ``None`` +skip_rows : list-like or integer, default ``None`` Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. @@ -197,7 +197,7 @@ skiprows : list-like or integer, default ``None`` data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) + pd.read_csv(StringIO(data), skip_rows=lambda x: x % 2 != 0) skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). 
@@ -326,7 +326,7 @@ comment : str, default ``None`` Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter `header` but not by `skiprows`. + commented lines are ignored by the parameter `header` but not by `skip_rows`. For example, if ``comment='#'``, parsing '#empty\\na,b,c\\n1,2,3' with `header=0` will result in 'a,b,c' being treated as the header. encoding : str, default ``None`` @@ -651,24 +651,24 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: The presence of ignored lines might create ambiguities involving line numbers; the parameter ``header`` uses row numbers (ignoring commented/empty - lines), while ``skiprows`` uses line numbers (including commented/empty lines): + lines), while ``skip_rows`` uses line numbers (including commented/empty lines): .. ipython:: python data = '#comment\na,b,c\nA,B,C\n1,2,3' pd.read_csv(StringIO(data), comment='#', header=1) data = 'A,B,C\n#comment\na,b,c\n1,2,3' - pd.read_csv(StringIO(data), comment='#', skiprows=2) + pd.read_csv(StringIO(data), comment='#', skip_rows=2) - If both ``header`` and ``skiprows`` are specified, ``header`` will be - relative to the end of ``skiprows``. For example: + If both ``header`` and ``skip_rows`` are specified, ``header`` will be + relative to the end of ``skip_rows``. For example: .. ipython:: python data = '# empty\n# second empty line\n# third empty' \ 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0' print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + pd.read_csv(StringIO(data), comment='#', skip_rows=4, header=1) .. _io.comments: @@ -2373,14 +2373,14 @@ Specify a number of rows to skip: .. code-block:: python - dfs = pd.read_html(url, skiprows=0) + dfs = pd.read_html(url, skip_rows=0) Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works as well): .. code-block:: python - dfs = pd.read_html(url, skiprows=range(2)) + dfs = pd.read_html(url, skip_rows=range(2)) Specify an HTML attribute: From f1c4f26878c27651a9b96bc4da4082cb7beefdd2 Mon Sep 17 00:00:00 2001 From: palewire Date: Thu, 16 Aug 2018 04:38:18 -0700 Subject: [PATCH 3/4] Linter fixes --- pandas/io/html.py | 6 +++++- pandas/tests/io/test_html.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 0b8dd0aa5b31a..d4999d92ae0b3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -104,7 +104,11 @@ def _get_skiprows(skip_rows): A proper iterator to use to skip rows of a DataFrame. 
""" if isinstance(skip_rows, slice): - return lrange(skip_rows.start or 0, skip_rows.stop, skip_rows.step or 1) + return lrange( + skip_rows.start or 0, + skip_rows.stop, + skip_rows.step or 1 + ) elif isinstance(skip_rows, numbers.Integral) or is_list_like(skip_rows): return skip_rows elif skip_rows is None: diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9a92db5449112..e1889351f38c5 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -159,7 +159,11 @@ def test_skiprows_int(self): assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=range(2))[0] + df1 = self.read_html( + self.spam_data, + '.*Water.*', + skip_rows=range(2) + )[0] df2 = self.read_html(self.spam_data, 'Unit', skip_rows=range(2))[0] tm.assert_frame_equal(df1, df2) @@ -188,7 +192,11 @@ def test_skiprows_slice_short(self): assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skip_rows=slice(2, 5)) + df1 = self.read_html( + self.spam_data, + '.*Water.*', + skip_rows=slice(2, 5) + ) df2 = self.read_html(self.spam_data, 'Unit', skip_rows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) From 825751071803544776ba4c1cf36f0c3528decd6f Mon Sep 17 00:00:00 2001 From: palewire Date: Mon, 3 Sep 2018 13:09:17 -0700 Subject: [PATCH 4/4] Added first run at a deprecation warning --- pandas/io/parsers.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b7bd298834243..8d4be2ab4072b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -617,7 +617,10 @@ def parser_f(filepath_or_buffer, delim_whitespace=False, low_memory=_c_parser_defaults['low_memory'], memory_map=False, - float_precision=None): + float_precision=None, + + # Deprecated with warnings + skiprows=None): # deprecate read_table GH21948 if name == "read_table": @@ -647,6 +650,15 @@ def parser_f(filepath_or_buffer, engine = 'c' engine_specified = False + # Handle deprecated kwargs + if skiprows: + warnings.warn( + "skiprows will be deprecated. Use skip_rows instead.", + FutureWarning + ) + if not skip_rows: + skip_rows = skiprows + kwds = dict(delimiter=delimiter, engine=engine, dialect=dialect,