From 6bf83c5dc575f52c84783d6bd6c4b9713b6201ab Mon Sep 17 00:00:00 2001 From: Scott E Lasley Date: Fri, 7 Nov 2014 15:31:53 -0500 Subject: [PATCH] BUG CSV: fix problem with trailing whitespace in skipped rows, issues 8661, 8679 ENH CSV: Reduce memory usage when skiprows is an integer in read_csv, issue 8681 --- doc/source/whatsnew/v0.15.2.txt | 2 + pandas/io/tests/test_parsers.py | 23 ++++++++ pandas/parser.pyx | 12 ++-- pandas/src/parser/tokenizer.c | 101 ++++++++++++++++++++++++++------ pandas/src/parser/tokenizer.h | 4 ++ vb_suite/io_bench.py | 16 +++++ 6 files changed, 135 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index d6d36fd8d14ba..1e84762b60caa 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -74,6 +74,7 @@ Enhancements Performance ~~~~~~~~~~~ +- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`) .. _whatsnew_0152.experimental: @@ -155,3 +156,4 @@ Bug Fixes of the level names are numbers (:issue:`8584`). - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is not lexically sorted or unique (:issue:`7724`) +- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 228dad984bb3c..59647b4c781e5 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3048,6 +3048,29 @@ def test_comment_skiprows(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4) tm.assert_almost_equal(df.values, expected) + def test_trailing_spaces(self): + data = """skip +random line with trailing spaces +skip +1,2,3 +1,2.,4. +random line with trailing tabs\t\t\t + +5.,NaN,10.0 +""" + expected = pd.DataFrame([[1., 2., 4.], + [5., np.nan, 10.]]) + # this should ignore six lines including lines with trailing + # whitespace and blank lines. 
issues 8661, 8679 + df = self.read_csv(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + df = self.read_table(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + def test_comment_header(self): data = """# empty # second empty line diff --git a/pandas/parser.pyx b/pandas/parser.pyx index afaa5219ab0cd..0409ee56f22bb 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -86,6 +86,7 @@ cdef extern from "parser/tokenizer.h": EAT_COMMENT EAT_LINE_COMMENT WHITESPACE_LINE + SKIP_LINE FINISHED enum: ERROR_OVERFLOW @@ -158,6 +159,7 @@ cdef extern from "parser/tokenizer.h": int header_end # header row end void *skipset + int64_t skip_first_N_rows int skip_footer double (*converter)(const char *, char **, char, char, char, int) @@ -181,6 +183,8 @@ cdef extern from "parser/tokenizer.h": void parser_free(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) + int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) + void parser_set_default_options(parser_t *self) int parser_consume_rows(parser_t *self, size_t nrows) @@ -524,10 +528,10 @@ cdef class TextReader: cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): - self.skiprows = range(self.skiprows) - - for i in self.skiprows: - parser_add_skiprow(self.parser, i) + parser_set_skipfirstnrows(self.parser, self.skiprows) + else: + for i in self.skiprows: + parser_add_skiprow(self.parser, i) cdef _setup_parser_source(self, source): cdef: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 9a7303b6874db..fc96cc5429775 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -156,6 +156,7 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; + self-> 
skip_first_N_rows = -1; self->skip_footer = 0; } @@ -444,21 +445,17 @@ static int end_line(parser_t *self) { } } - if (self->skipset != NULL) { - k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines); - - if (k != ((kh_int64_t*)self->skipset)->n_buckets) { - TRACE(("Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + if (self->state == SKIP_LINE) { + TRACE(("Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; + + // skip the tokens from this bad line + self->line_start[self->lines] += fields; - // reset field count - self->line_fields[self->lines] = 0; - return 0; - } + // reset field count + self->line_fields[self->lines] = 0; + return 0; } /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ @@ -556,6 +553,15 @@ int parser_add_skiprow(parser_t *self, int64_t row) { return 0; } +int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { + // self->file_lines is zero based so subtract 1 from nrows + if (nrows > 0) { + self->skip_first_N_rows = nrows - 1; + } + + return 0; +} + static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int status; size_t bytes_read; @@ -656,6 +662,15 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen)); +int skip_this_line(parser_t *self, int64_t rownum) { /* nonzero when row `rownum` is to be skipped */ + if (self->skipset != NULL) { + return ( kh_get_int64((kh_int64_t*) self->skipset, rownum) != + ((kh_int64_t*)self->skipset)->n_buckets ); + } + else { + return ( rownum <= self->skip_first_N_rows ); + } +} int tokenize_delimited(parser_t *self, size_t line_limit) { @@ -688,10 +703,25 @@ int tokenize_delimited(parser_t *self, size_t line_limit) switch(self->state) { + case SKIP_LINE: +// TRACE(("tokenize_delimited SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') 
{ + END_LINE(); + } + break; + case START_RECORD: // start of record - - if (c == '\n') { + if (skip_this_line(self, self->file_lines)) { + /* set SKIP_LINE first: end_line() only drops a row in that state, */ + /* so an empty skipped line would otherwise be kept as a record */ + self->state = SKIP_LINE; + if (c == '\n') { + END_LINE(); + } + break; + } + else if (c == '\n') { // \n\r possible? if (self->skip_empty_lines) { @@ -1006,9 +1036,26 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + + case SKIP_LINE: +// TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state)); + if (c == self->lineterminator) { + END_LINE(); + } + break; + case START_RECORD: // start of record - if (c == self->lineterminator) { + if (skip_this_line(self, self->file_lines)) { + /* set SKIP_LINE first: end_line() only drops a row in that state, */ + /* so an empty skipped line would otherwise be kept as a record */ + self->state = SKIP_LINE; + if (c == self->lineterminator) { + END_LINE(); + } + break; + } + else if (c == self->lineterminator) { // \n\r possible? if (self->skip_empty_lines) { @@ -1252,6 +1299,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + + case SKIP_LINE: +// TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') { + END_LINE(); + } + break; + case WHITESPACE_LINE: if (c == '\n') { self->file_lines++; @@ -1283,9 +1338,17 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record - if (c == '\n') { - // \n\r possible? + if (skip_this_line(self, self->file_lines)) { + /* set SKIP_LINE first: end_line() only drops a row in that state, */ + /* so an empty skipped line would otherwise be kept as a record */ + self->state = SKIP_LINE; + if (c == '\n') { + END_LINE(); + } + break; + } else if (c == '\n') { if (self->skip_empty_lines) + // \n\r possible? 
{ self->file_lines++; } diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 0947315fbe6b7..07f4153038dd8 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -127,6 +127,7 @@ typedef enum { EAT_COMMENT, EAT_LINE_COMMENT, WHITESPACE_LINE, + SKIP_LINE, FINISHED } ParserState; @@ -203,6 +204,7 @@ typedef struct parser_t { int header_end; // header row end void *skipset; + int64_t skip_first_N_rows; int skip_footer; double (*converter)(const char *, char **, char, char, char, int); @@ -240,6 +242,8 @@ int parser_trim_buffers(parser_t *self); int parser_add_skiprow(parser_t *self, int64_t row); +int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); + void parser_free(parser_t *self); void parser_set_default_options(parser_t *self); diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index 0b9f68f0e6ed5..a70c543ca59eb 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -21,6 +21,22 @@ read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1, start_date=datetime(2011, 9, 15)) +#---------------------------------- +# skiprows + +setup1 = common_setup + """ +index = tm.makeStringIndex(20000) +df = DataFrame({'float1' : randn(20000), + 'float2' : randn(20000), + 'string1' : ['foo'] * 20000, + 'bool1' : [True] * 20000, + 'int1' : np.random.randint(0, 200000, size=20000)}, + index=index) +df.to_csv('__test__.csv') +""" + +read_csv_skiprows = Benchmark("read_csv('__test__.csv', skiprows=10000)", setup1, + start_date=datetime(2011, 9, 15)) #---------------------------------------------------------------------- # write_csv