From de27eef23acf377c1eed1c117765ff4bf81a6d2f Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 10 May 2013 21:39:40 -0400 Subject: [PATCH 1/7] ENH: to_csv write multi-index columns similar to how they are displayed in to_string --- pandas/core/format.py | 39 ++++++++++++++++++++++++++++++++++---- pandas/tests/test_frame.py | 12 +++++++++++- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index bea4b59bfaaa4..285d50373d811 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -959,9 +959,12 @@ def _save_header(self): index_label = self.index_label cols = self.cols header = self.header + has_mi_columns = isinstance(obj.columns, MultiIndex) + encoded_labels = [] has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or self.header: + if self.index: # should write something for index label if index_label is not False: @@ -994,12 +997,40 @@ def _save_header(self): write_cols = header else: write_cols = cols - encoded_cols = list(write_cols) - writer.writerow(encoded_labels + encoded_cols) + if not has_mi_columns: + encoded_labels += list(write_cols) + else: - encoded_cols = list(cols) - writer.writerow(encoded_cols) + + if not has_mi_columns: + encoded_labels += list(cols) + + # write out the mi + if has_mi_columns: + columns = obj.columns + + # write out the names for each level, then ALL of the values for each level + for i in range(columns.nlevels): + + # name is the first column + col_line = [ columns.names[i] ] + + # skipp len labels-1 + if self.index and isinstance(index_label,list) and len(index_label)>1: + col_line.extend([ '' ] * (len(index_label)-1)) + + for j in range(len(columns)): + col_line.append(columns.levels[i][j]) + + writer.writerow(col_line) + + # add blanks for the columns, so that we + # have consistent seps + encoded_labels.extend([ '' ] * len(columns)) + + # write out the index label line + writer.writerow(encoded_labels) def _save(self): diff --git 
a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8e48ef094c419..d8eb2748dda29 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4962,6 +4962,7 @@ def test_to_csv_multiindex(self): frame.index = new_index with ensure_clean(pname) as path: + frame.to_csv(path, header=False) frame.to_csv(path, cols=['A', 'B']) @@ -4973,7 +4974,7 @@ def test_to_csv_multiindex(self): self.assertEqual(frame.index.names, df.index.names) self.frame.index = old_index # needed if setUP becomes a classmethod - # try multiindex with dates + # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] @@ -4994,6 +4995,15 @@ def test_to_csv_multiindex(self): assert_almost_equal(recons.values, self.tsframe.values) self.tsframe.index = old_index # needed if setUP becomes classmethod + with ensure_clean(pname) as path: + # column & index are mi + import pdb; pdb.set_trace() + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path) + + result = pd.read_csv(path,header=[0,1,2,3],index_col=[0,1]) + + with ensure_clean(pname) as path: # empty tsframe[:0].to_csv(path) From cc93d614eaa3e3c46daf340a4aae58b56a0fa226 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 10 May 2013 23:47:53 -0400 Subject: [PATCH 2/7] ENH: Allow read_csv to handle multi-index in columns GH3571, GH1651, GH3141 --- pandas/core/format.py | 69 +++++++++---------- pandas/io/parsers.py | 53 +++++++++++---- pandas/io/tests/test_cparser.py | 2 +- pandas/src/parser.pyx | 113 ++++++++++++++++++++------------ pandas/src/parser/tokenizer.c | 4 +- pandas/src/parser/tokenizer.h | 2 + pandas/tests/test_frame.py | 14 ++-- 7 files changed, 162 insertions(+), 95 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 285d50373d811..2eaa17bc659c3 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -963,48 +963,49 @@ def _save_header(self): encoded_labels = [] has_aliases = isinstance(header, 
(tuple, list, np.ndarray)) - if has_aliases or self.header: + if not (has_aliases or self.header): + return - if self.index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(obj.index, MultiIndex): - index_label = [] - for i, name in enumerate(obj.index.names): - if name is None: - name = '' - index_label.append(name) + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [''] else: - index_label = obj.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, (list, tuple, np.ndarray)): - # given a string for a DF with Index - index_label = [index_label] + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] - encoded_labels = list(index_label) - else: - encoded_labels = [] + encoded_labels = list(index_label) + else: + encoded_labels = [] - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(header)))) - else: - write_cols = header + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) else: - write_cols = cols + write_cols = header + else: + write_cols = cols - if not has_mi_columns: - encoded_labels += list(write_cols) + if not has_mi_columns: + encoded_labels += list(write_cols) - else: + else: - if not has_mi_columns: - encoded_labels += list(cols) + if not has_mi_columns: + encoded_labels += list(cols) # write out the mi if has_mi_columns: diff --git 
a/pandas/io/parsers.py b/pandas/io/parsers.py index 044b25041afd9..dca3dfb5e5cec 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -677,10 +677,8 @@ def read(self, nrows=None): if self.options.get('as_recarray'): return ret - index, columns, col_dict = ret - # May alter columns / col_dict - # index, columns, col_dict = self._create_index(col_dict, columns) + index, columns, col_dict = self._create_index(ret) df = DataFrame(col_dict, columns=columns, index=index) @@ -688,8 +686,9 @@ def read(self, nrows=None): return df[df.columns[0]] return df - def _create_index(self, col_dict, columns): - pass + def _create_index(self, ret): + index, columns, col_dict = ret + return index, columns, col_dict def get_chunk(self, size=None): if size is None: @@ -709,6 +708,7 @@ def __init__(self, kwds): self.index_col = kwds.pop('index_col', None) self.index_names = None + self.col_names = None self.parse_dates = kwds.pop('parse_dates', False) self.date_parser = kwds.pop('date_parser', None) @@ -942,7 +942,32 @@ def __init__(self, src, **kwds): if self._reader.header is None: self.names = None else: - self.names = list(self._reader.header) + if len(self._reader.header) > 1: + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + if (self._reader.leading_cols == 0 and + _is_index_col(self.index_col)): + ic = self.index_col + if not isinstance(ic, (list,tuple,np.ndarray)): + ic = [ ic ] + sic = set(ic) + + header = list(self._reader.header) + index_names = header.pop(-1) + self.index_names = [ index_names[i] for i in ic ] + field_count = len(header[0]) + + def extract(r): + return tuple([ r[i] for i in range(field_count) if i not in sic ]) + + self.names = ic + zip(*[ extract(r) for r in header ]) + self.col_names = [ r[0] if len(r[0]) else None for r in header ] + passed_names = True + else: + raise Exception("must have an index_col when have a multi-index specified") + 
else: + self.names = list(self._reader.header[0]) if self.names is None: if self.prefix: @@ -958,12 +983,14 @@ def __init__(self, src, **kwds): if not self._has_complex_date_col: if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): + _is_index_col(self.index_col)): self._name_processed = True - (self.index_names, self.names, - self.index_col) = _clean_index_names(self.names, - self.index_col) + (index_names, self.names, + self.index_col) = _clean_index_names(self.names, self.index_col) + + if self.index_names is None: + self.index_names = index_names if self._reader.header is None and not passed_names: self.index_names = [None] * len(self.index_names) @@ -1051,6 +1078,10 @@ def read(self, nrows=None): names, data = self._do_date_conversions(names, data) index = self._make_index(data, alldata, names) + # possibly create a column mi here + if all([ isinstance(c,tuple) for c in names]): + names = MultiIndex.from_tuples(names,names=self.col_names) + return index, names, data def _filter_usecols(self, names): @@ -1061,7 +1092,7 @@ def _filter_usecols(self, names): return names def _get_index_names(self): - names = list(self._reader.header) + names = list(self._reader.header[0]) idx_names = None if self._reader.leading_cols == 0 and self.index_col is not None: diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index b352b189a74b8..0c5b168ee8de5 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -179,7 +179,7 @@ def test_header_not_enough_lines(self): reader = TextReader(StringIO(data), delimiter=',', header=2, as_recarray=True) header = reader.header - expected = ['a', 'b', 'c'] + expected = [['a', 'b', 'c']] self.assertEquals(header, expected) recs = reader.read() diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 694a769641b0d..97e31515bec78 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h": char 
thousands int header # Boolean: 1: has header, 0: no header + int header_start # header row start + int header_end # header row end void *skipset int skip_footer @@ -242,7 +244,7 @@ cdef class TextReader: object na_values, true_values, false_values object memory_map object as_recarray - object header, names + object header, names, header_start, header_end object low_memory object skiprows object compact_ints, use_unsigned @@ -256,6 +258,8 @@ cdef class TextReader: delimiter=b',', header=0, + header_start=0, + header_end=0, names=None, memory_map=False, @@ -435,11 +439,28 @@ cdef class TextReader: # TODO: no header vs. header is not the first row if header is None: # sentinel value + self.parser.header_start = -1 + self.parser.header_end = -1 self.parser.header = -1 self.parser_start = 0 + self.header = [] else: - self.parser.header = header - self.parser_start = header + 1 + if isinstance(header, list) and len(header): + # need to artifically skip the final line + # which is still a header line + header.append(header[-1]+1) + + self.parser.header_start = header[0] + self.parser.header_end = header[-1] + self.parser.header = header[0] + self.parser_start = header[-1] + 1 + self.header = header + else: + self.parser.header_start = header + self.parser.header_end = header + self.parser.header = header + self.parser_start = header + 1 + self.header = [ header ] self.names = names self.header, self.table_width = self._get_header() @@ -534,8 +555,10 @@ cdef class TextReader: ' got %s type' % type(source)) cdef _get_header(self): + # header is now a list of lists, so field_count should use header[0] + cdef: - size_t i, start, data_line, field_count, passed_count + size_t i, start, data_line, field_count, passed_count, hr char *word object name int status @@ -544,49 +567,53 @@ cdef class TextReader: header = [] - if self.parser.header >= 0: - # Header is in the file + if self.parser.header_start >= 0: - if self.parser.lines < self.parser.header + 1: - 
self._tokenize_rows(self.parser.header + 2) - - # e.g., if header=3 and file only has 2 lines - if self.parser.lines < self.parser.header + 1: - raise CParserError('Passed header=%d but only %d lines in file' - % (self.parser.header, self.parser.lines)) + # Header is in the file + for hr in self.header: - field_count = self.parser.line_fields[self.parser.header] - start = self.parser.line_start[self.parser.header] + this_header = [] - # TODO: Py3 vs. Py2 - counts = {} - for i in range(field_count): - word = self.parser.words[start + i] + if self.parser.lines < hr + 1: + self._tokenize_rows(hr + 2) - if self.c_encoding == NULL and not PY3: - name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + # e.g., if header=3 and file only has 2 lines + if self.parser.lines < hr + 1: + raise CParserError('Passed header=%d but only %d lines in file' + % (self.parser.header, self.parser.lines)) - if name == '': - name = 'Unnamed: %d' % i + field_count = self.parser.line_fields[hr] + start = self.parser.line_start[hr] + # TODO: Py3 vs. 
Py2 + counts = {} + for i in range(field_count): + word = self.parser.words[start + i] - count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols: - header.append('%s.%d' % (name, count)) - else: - header.append(name) - counts[name] = count + 1 + if self.c_encoding == NULL and not PY3: + name = PyBytes_FromString(word) + else: + if self.c_encoding == NULL or self.c_encoding == b'utf-8': + name = PyUnicode_FromString(word) + else: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) + + if name == '': + name = 'Unnamed: %d' % i + + count = counts.get(name, 0) + if count > 0 and self.mangle_dupe_cols: + this_header.append('%s.%d' % (name, count)) + else: + this_header.append(name) + counts[name] = count + 1 - data_line = self.parser.header + 1 + data_line = hr + 1 + header.append(this_header) if self.names is not None: - header = self.names + header = [ self.names ] elif self.names is not None: # Enforce this unless usecols @@ -597,11 +624,11 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - header = self.names + header = [ self.names ] data_line = 0 if self.parser.lines < 1: - field_count = len(header) + field_count = len(header[0]) else: field_count = self.parser.line_fields[data_line] else: @@ -613,7 +640,7 @@ cdef class TextReader: # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: - field_count = len(header) + field_count = len(header[0]) else: # not self.has_usecols: field_count = self.parser.line_fields[data_line] @@ -622,7 +649,7 @@ cdef class TextReader: if self.names is not None: field_count = max(field_count, len(self.names)) - passed_count = len(header) + passed_count = len(header[0]) # if passed_count > field_count: # raise CParserError('Column names have %d fields, ' @@ -1038,10 +1065,10 @@ cdef class TextReader: if self.header is not None: j = i - self.leading_cols # hack for #2442 - if j == len(self.header): + if j == len(self.header[0]): return j else: - return 
self.header[j] + return self.header[0][j] else: return None diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 09cddd07e1c1d..81fda37acbb71 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -463,7 +463,7 @@ static int end_line(parser_t *self) { /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ - if (!(self->lines <= self->header + 1) + if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields)) { // increment file line count self->file_lines++; @@ -498,7 +498,7 @@ static int end_line(parser_t *self) { } else { /* missing trailing delimiters */ - if ((self->lines >= self->header + 1) && fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { /* Might overrun the buffer when closing fields */ if (make_stream_space(self, ex_fields - fields) < 0) { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 566e89ae5f9a7..5ba1b99a29d39 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -195,6 +195,8 @@ typedef struct parser_t { char thousands; int header; // Boolean: 1: has header, 0: no header + int header_start; // header row start + int header_end; // header row end void *skipset; int skip_footer; diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d8eb2748dda29..c19de854de130 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4996,13 +4996,19 @@ def test_to_csv_multiindex(self): self.tsframe.index = old_index # needed if setUP becomes classmethod with ensure_clean(pname) as path: - # column & index are mi - import pdb; pdb.set_trace() + # GH3571, GH1651, GH3141 + + # column & index are multi-iindex df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) df.to_csv(path) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1]) + assert_frame_equal(df,result) - result = 
pd.read_csv(path,header=[0,1,2,3],index_col=[0,1]) - + # column is mi + df = mkdf(5,3,r_idx_nlevels=1,c_idx_nlevels=4) + df.to_csv(path) + result = read_csv(path,header=[0,1,2,3],index_col=0) + assert_frame_equal(df,result) with ensure_clean(pname) as path: # empty From c64555b006fe545d6e4542667ddf31ff91275274 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 11 May 2013 09:31:54 -0400 Subject: [PATCH 3/7] TST: more test cases ENH: catching some invalid option combinations BUG: fix as_recarray DOC: io.rst updated --- doc/source/io.rst | 20 +++++++++++++++++++- pandas/io/parsers.py | 22 +++++++++++++++++++--- pandas/io/tests/test_parsers.py | 33 +++++++++++++++++++++++++++++++++ pandas/src/parser.pyx | 3 +++ 4 files changed, 74 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f15f758c42b18..5c0567b21dbf4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -57,7 +57,10 @@ They can take a number of arguments: specified, data types will be inferred. - ``header``: row number to use as the column names, and the start of the data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly - pass ``header=0`` to be able to replace existing names. + pass ``header=0`` to be able to replace existing names. The header can be + a list of integers that specify row locations for a multi-index on the columns + E.g. [0,1,3]. Interveaning rows that are not specified will be skipped. + (E.g. 2 in this example are skipped) - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows - ``index_col``: column number, column name, or list of column numbers/names, @@ -253,6 +256,21 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. 
_io.multi_index_columns: + +Specifying multi-index columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By specifying a list of row locations for the ``header`` argument, you +can read in a multi-index for the columns. Specifying non-consecutive +rows will skip the intervening rows. The ``index_col`` must also be +specified. + +.. ipython:: python + + data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n' + pd.read_csv(StringIO(data), header=[0,1], index_col=[0]) + .. _io.usecols: Filtering columns (``usecols``) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dca3dfb5e5cec..f380da680eca6 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -52,9 +52,11 @@ class DateConversionError(Exception): dialect : string or csv.Dialect instance, default None If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details -header : int, default 0 if names parameter not specified, otherwise None +header : int or list of ints, default 0 if names parameter not specified, otherwise None Row to use for the column labels of the parsed DataFrame. Specify None if - there is no header row. + there is no header row. Can be a list of integers that specify row + locations for a multi-index on the columns E.g. [0,1,3]. Intervening + rows that are not specified (E.g. 
2 in this example are skipped) skiprows : list-like or integer Row numbers to skip (0-indexed) or number of rows to skip (int) at the start of the file @@ -531,6 +533,16 @@ def __init__(self, f, engine='python', **kwds): if kwds.get('header', 'infer') == 'infer': kwds['header'] = 0 if kwds.get('names') is None else None + # validate header options for mi + h = kwds['header'] + if isinstance(h,(list,tuple,np.ndarray)): + if kwds.get('index_col') is None: + raise Exception("must have an index_col when have a " + "multi-index header is specified") + if kwds.get('as_recarray'): + raise Exception("cannot specify as_recarray when " + "specifying a multi-index header") + self.orig_options = kwds # miscellanea @@ -965,7 +977,8 @@ def extract(r): self.col_names = [ r[0] if len(r[0]) else None for r in header ] passed_names = True else: - raise Exception("must have an index_col when have a multi-index specified") + raise Exception("must have an index_col when have a multi-index " + "header is specified") else: self.names = list(self._reader.header[0]) @@ -1381,6 +1394,9 @@ def _infer_columns(self): names = self.names if self.header is not None: + if isinstance(self.header,(list,tuple,np.ndarray)): + raise Exception("PythonParser does not support a multi-index header") + if len(self.buf) > 0: line = self.buf[0] else: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 38a31c042d120..bd55f9c74922c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -20,6 +20,7 @@ TextFileReader, TextParser) from pandas.util.testing import (assert_almost_equal, assert_series_equal, + makeCustomDataframe as mkdf, network, ensure_clean) import pandas.util.testing as tm @@ -994,6 +995,38 @@ def test_header_not_first_line(self): expected = self.read_csv(StringIO(data2), header=0, index_col=0) tm.assert_frame_equal(df, expected) + def test_header_multi_index(self): + expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + + data = 
"""\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + + # python-engine + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], engine='python') + + # must specify index_col + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3]) + + # no as_recarray + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], as_recarray=True) + + # skipping lines in the header + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1]) + tm.assert_frame_equal(df, expected) + def test_pass_names_with_index(self): lines = self.data1.split('\n') no_header = '\n'.join(lines[1:]) diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 97e31515bec78..01b600c975cb9 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -1789,6 +1789,9 @@ def _to_structured_array(dict columns, object names): if names is None: names = ['%d' % i for i in range(len(columns))] + else: + # single line header + names = names[0] dt = np.dtype([(str(name), columns[i].dtype) for i, name in enumerate(names)]) From d6573f536e055c498563901048e4736c01fc50b2 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 13 May 2013 08:34:29 -0400 Subject: [PATCH 4/7] ENH/CLN: refactor to support PythonParser as well as CParser --- pandas/io/parsers.py | 153 +++++++++++++++++++++----------- pandas/io/tests/test_parsers.py | 7 +- pandas/tests/test_frame.py | 17 ++-- 3 files changed, 108 insertions(+), 69 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f380da680eca6..e2f4a59b24c87 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -755,7 +755,42 @@ def _should_parse_dates(self, i): else: return (j in self.parse_dates) or (name in 
self.parse_dates) - def _make_index(self, data, alldata, columns): + + def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): + """ extract and return the names, index_names, col_names + header is a list-of-lists returned from the parsers """ + if len(header) < 2: + return header[0], index_names, col_names, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if not isinstance(ic, (list,tuple,np.ndarray)): + ic = [ ic ] + sic = set(ic) + + orig_header = list(header) + index_names = header.pop(-1) + index_names = [ index_names[i] for i in ic ] + field_count = len(header[0]) + + def extract(r): + return tuple([ r[i] for i in range(field_count) if i not in sic ]) + + names = ic + zip(*[ extract(r) for r in header ]) + col_names = [ r[0] if len(r[0]) else None for r in header ] + passed_names = True + + return names, index_names, col_names, passed_names + + def _maybe_make_multi_index_columns(self, columns, col_names=None): + # possibly create a column mi here + if len(columns) and not isinstance(columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): + columns = MultiIndex.from_tuples(columns,names=col_names) + return columns + + def _make_index(self, data, alldata, columns, indexnamerow=False): if not _is_index_col(self.index_col) or len(self.index_col) == 0: index = None @@ -772,7 +807,15 @@ def _make_index(self, data, alldata, columns): index = self._get_complex_date_index(data, columns) index = self._agg_index(index, try_parse_dates=False) - return index + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index.names = indexnamerow[:coffset] + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns _implicit_index = False @@ -955,27 +998,11 @@ def 
__init__(self, src, **kwds): self.names = None else: if len(self._reader.header) > 1: - # the names are the tuples of the header that are not the index cols - # 0 is the name of the index, assuming index_col is a list of column - # numbers + # we have a multi index in the columns if (self._reader.leading_cols == 0 and _is_index_col(self.index_col)): - ic = self.index_col - if not isinstance(ic, (list,tuple,np.ndarray)): - ic = [ ic ] - sic = set(ic) - - header = list(self._reader.header) - index_names = header.pop(-1) - self.index_names = [ index_names[i] for i in ic ] - field_count = len(header[0]) - - def extract(r): - return tuple([ r[i] for i in range(field_count) if i not in sic ]) - - self.names = ic + zip(*[ extract(r) for r in header ]) - self.col_names = [ r[0] if len(r[0]) else None for r in header ] - passed_names = True + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names) else: raise Exception("must have an index_col when have a multi-index " "header is specified") @@ -1089,11 +1116,10 @@ def read(self, nrows=None): data = dict((k, v) for k, (i, v) in zip(names, data)) names, data = self._do_date_conversions(names, data) - index = self._make_index(data, alldata, names) + index, names = self._make_index(data, alldata, names) - # possibly create a column mi here - if all([ isinstance(c,tuple) for c in names]): - names = MultiIndex.from_tuples(names,names=self.col_names) + # maybe create a mi on the columns + names = self._maybe_make_multi_index_columns(names, self.col_names) return index, names, data @@ -1252,6 +1278,13 @@ def __init__(self, f, **kwds): self.data = f self.columns = self._infer_columns() + # we are processing a multi index column + if len(self.columns) > 1: + self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names) + else: + self.columns = 
self.columns[0] + # get popped off for index self.orig_names = list(self.columns) @@ -1259,9 +1292,11 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (self.index_names, + (index_names, self.orig_names, _) = self._get_index_name(self.columns) self._name_processed = True + if self.index_names is None: + self.index_names = index_names self._first_chunk = True def _make_reader(self, f): @@ -1365,10 +1400,7 @@ def read(self, rows=None): columns, data = self._do_date_conversions(self.columns, data) data = self._convert_data(data) - index = self._make_index(data, alldata, columns) - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - index.names = indexnamerow[:coffset] + index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data @@ -1394,39 +1426,52 @@ def _infer_columns(self): names = self.names if self.header is not None: - if isinstance(self.header,(list,tuple,np.ndarray)): - raise Exception("PythonParser does not support a multi-index header") + header = self.header - if len(self.buf) > 0: - line = self.buf[0] + # we have a mi columns, so read and extra line + if isinstance(header,(list,tuple,np.ndarray)): + header = list(header) + [header[-1]+1] else: - line = self._next_line() - - while self.pos <= self.header: - line = self._next_line() + header = [ header ] columns = [] - for i, c in enumerate(line): - if c == '': - columns.append('Unnamed: %d' % i) + for hr in header: + + if len(self.buf) > 0: + line = self.buf[0] else: - columns.append(c) + line = self._next_line() - if self.mangle_dupe_cols: - counts = {} - for i, col in enumerate(columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - columns[i] = '%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 + while self.pos <= hr: + line = self._next_line() + + this_columns = [] + for i, c in enumerate(line): + if c == '': + this_columns.append('Unnamed: %d' % i) 
+ else: + this_columns.append(c) + + if self.mangle_dupe_cols: + counts = {} + for i, col in enumerate(this_columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + this_columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + + columns.append(this_columns) self._clear_buffer() if names is not None: - if len(names) != len(columns): + if len(names) != len(columns[0]): raise Exception('Number of passed names did not match ' 'number of header fields in the file') - columns = names + if len(columns) > 1: + raise Exception('Cannot pass names with multi-index columns') + columns = [ names ] + else: if len(self.buf) > 0: line = self.buf[0] @@ -1436,11 +1481,11 @@ def _infer_columns(self): ncols = len(line) if not names: if self.prefix: - columns = ['X%d' % i for i in range(ncols)] + columns = [ ['X%d' % i for i in range(ncols)] ] else: - columns = range(ncols) + columns = [ range(ncols) ] else: - columns = names + columns = [ names ] return columns diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index bd55f9c74922c..b9e773a916d4c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1012,9 +1012,10 @@ def test_header_multi_index(self): R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - # python-engine - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], - index_col=[0,1], engine='python') + # basic test with both engines + for engine in ['c','python']: + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], engine=engine) + tm.assert_frame_equal(df, expected) # must specify index_col self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3]) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c19de854de130..616fdd5ca2549 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4755,9 +4755,13 @@ def test_to_csv_moar(self): def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None, dupe_col=False): + 
header = 0 + if cnlvl: + header = range(cnlvl) + with ensure_clean(path) as path: df.to_csv(path,encoding='utf8',chunksize=chunksize) - recons = DataFrame.from_csv(path,parse_dates=False) + recons = DataFrame.from_csv(path,header=header,parse_dates=False) def _to_uni(x): if not isinstance(x,unicode): @@ -4773,16 +4777,6 @@ def _to_uni(x): recons.index = ix recons = recons.iloc[:,rnlvl-1:] - if cnlvl: - def stuple_to_tuple(x): - import re - x = x.split(",") - x = map(lambda x: re.sub("[\'\"\s\(\)]","",x),x) - return x - - cols=MultiIndex.from_tuples(map(stuple_to_tuple,recons.columns)) - recons.columns = cols - type_map = dict(i='i',f='f',s='O',u='O',dt='O',p='O') if r_dtype: if r_dtype == 'u': # unicode @@ -4827,7 +4821,6 @@ def stuple_to_tuple(x): assert_frame_equal(df, recons,check_names=False,check_less_precise=True) - N = 100 chunksize=1000 From b0dadc5c3d1407e913f797d554241780f2d8a830 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 13 May 2013 12:55:06 -0400 Subject: [PATCH 5/7] BUG: unnamed columns in a multi-index will be named like: Unamed 2_level_0, so they are not duplicated ENH: add options ``multi_index_columns_compat`` both to to_csv and read_csv (default is False), to force (when True) the previous behavior of creating a list of tuples (when writing), and reading as a list of tuples (and NOT as a MultiIndex) DOC: add compat flags to io.rst --- doc/source/io.rst | 7 +++++++ pandas/core/format.py | 11 +++++++---- pandas/core/frame.py | 9 +++++++-- pandas/io/parsers.py | 24 +++++++++++++++++++----- pandas/src/parser.pyx | 16 ++++++++++++---- pandas/tests/test_frame.py | 18 +++++++++++++++++- 6 files changed, 69 insertions(+), 16 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 5c0567b21dbf4..ef223f64d43c8 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -115,6 +115,10 @@ They can take a number of arguments: - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines ` - ``usecols``: a 
subset of columns to return, results in much faster parsing time and lower memory usage. + - ``mangle_dup_columns``: boolean, default True, then duplicate columns will be specified + as 'X.0'...'X.N', rather than 'X'...'X' + - ``multi_index_columns_compat``: boolean, default False, leave a list of tuples on columns + as is (default is to convert to a Multi Index on the columns) .. ipython:: python :suppress: @@ -271,6 +275,9 @@ specified. data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n' pd.read_csv(StringIO(data), header=[0,1], index_col=[0]) +You can pass ``multi_index_columns_compat=True`` to preserve the pre-0.12 behavior of +not converting a list of tuples in the columns to a Multi Index. + .. _io.usecols: Filtering columns (``usecols``) diff --git a/pandas/core/format.py b/pandas/core/format.py index 2eaa17bc659c3..e7ac540343d84 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -772,9 +772,10 @@ def grouper(x): class CSVFormatter(object): def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, - cols=None, header=True, index=True, index_label=None, - mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None, engine=None): + cols=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None, quoting=None, + line_terminator='\n', chunksize=None, engine=None, + multi_index_columns_compat=False): self.engine = engine # remove for 0.12 @@ -803,6 +804,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, msg= "columns.is_unique == False not supported with engine='python'" raise NotImplementedError(msg) + self.multi_index_columns_compat=multi_index_columns_compat if cols is not None: if isinstance(cols,Index): cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) @@ -959,7 +961,8 @@ def _save_header(self): index_label = self.index_label cols = self.cols header = 
self.header - has_mi_columns = isinstance(obj.columns, MultiIndex) + has_mi_columns = isinstance(obj.columns, MultiIndex + ) and not self.multi_index_columns_compat encoded_labels = [] has_aliases = isinstance(header, (tuple, list, np.ndarray)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39742557ccc56..bb7416b23aab4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1391,7 +1391,8 @@ def to_panel(self): def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None,**kwds): + line_terminator='\n', chunksize=None, + multi_index_columns_compat=False, **kwds): """ Write DataFrame to a comma-separated values (csv) file @@ -1429,6 +1430,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL chunksize : rows to write at a time + multi_index_columns_compat : boolean, default False + write multi_index columns as a list of tuples (if True) + or new (expanded format)m if False) """ if nanRep is not None: # pragma: no cover import warnings @@ -1445,7 +1449,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, float_format=float_format, cols=cols, header=header, index=index, index_label=index_label,mode=mode, - chunksize=chunksize,engine=kwds.get("engine") ) + chunksize=chunksize,engine=kwds.get("engine"), + multi_index_columns_compat=multi_index_columns_compat) formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e2f4a59b24c87..78a941218c1d6 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -127,6 +127,11 @@ class DateConversionError(Exception): usecols : array-like Return a subset of the columns. Results in much faster parsing time and lower memory usage. 
+mangle_dup_columns: boolean, default True + Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' +multi_index_columns_compat: boolean, default False + Leave a list of tuples on columns as is (default is to convert to + a Multi Index on the columns) Returns ------- @@ -294,6 +299,7 @@ def _read(filepath_or_buffer, kwds): 'squeeze': False, 'compression': None, 'mangle_dupe_cols': True, + 'multi_index_columns_compat':False, } @@ -380,7 +386,8 @@ def parser_f(filepath_or_buffer, verbose=False, encoding=None, squeeze=False, - mangle_dupe_cols=True + mangle_dupe_cols=True, + multi_index_columns_compat=False, ): # Alias sep -> delimiter. @@ -438,7 +445,7 @@ def parser_f(filepath_or_buffer, error_bad_lines=error_bad_lines, low_memory=low_memory, buffer_lines=buffer_lines, - mangle_dupe_cols=mangle_dupe_cols + mangle_dupe_cols=mangle_dupe_cols, ) return _read(filepath_or_buffer, kwds) @@ -730,6 +737,7 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') + self.multi_index_columns_compat = kwds.get('multi_index_columns_compat',False) self._date_conv = _make_date_converter(date_parser=self.date_parser, dayfirst=self.dayfirst) @@ -786,7 +794,8 @@ def extract(r): def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here - if len(columns) and not isinstance(columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): + if not self.multi_index_columns_compat and len(columns) and not isinstance( + columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): columns = MultiIndex.from_tuples(columns,names=col_names) return columns @@ -1430,12 +1439,14 @@ def _infer_columns(self): # we have a mi columns, so read and extra line if isinstance(header,(list,tuple,np.ndarray)): + have_mi_columns = True header = list(header) + [header[-1]+1] else: + have_mi_columns = False header = [ header ] columns 
= [] - for hr in header: + for level, hr in enumerate(header): if len(self.buf) > 0: line = self.buf[0] @@ -1448,7 +1459,10 @@ def _infer_columns(self): this_columns = [] for i, c in enumerate(line): if c == '': - this_columns.append('Unnamed: %d' % i) + if have_mi_columns: + this_columns.append('Unnamed: %d_level_%d' % (i,level)) + else: + this_columns.append('Unnamed: %d' % i) else: this_columns.append(c) diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 01b600c975cb9..62e9d39cd792d 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -232,7 +232,7 @@ cdef class TextReader: cdef: parser_t *parser object file_handle - bint factorize, na_filter, verbose, has_usecols + bint factorize, na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks char *c_encoding @@ -252,6 +252,7 @@ cdef class TextReader: object encoding object compression object mangle_dupe_cols + object multi_index_columns_compat set noconvert, usecols def __cinit__(self, source, @@ -304,12 +305,14 @@ cdef class TextReader: skiprows=None, skip_footer=0, verbose=False, - mangle_dupe_cols=True): + mangle_dupe_cols=True, + multi_index_columns_compat=False): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols=mangle_dupe_cols + self.multi_index_columns_compat=multi_index_columns_compat # For timekeeping self.clocks = [] @@ -437,6 +440,7 @@ cdef class TextReader: self.leading_cols = 0 # TODO: no header vs. 
header is not the first row + self.has_mi_columns = 0 if header is None: # sentinel value self.parser.header_start = -1 @@ -454,6 +458,7 @@ cdef class TextReader: self.parser.header_end = header[-1] self.parser.header = header[0] self.parser_start = header[-1] + 1 + self.has_mi_columns = 1 self.header = header else: self.parser.header_start = header @@ -570,7 +575,7 @@ cdef class TextReader: if self.parser.header_start >= 0: # Header is in the file - for hr in self.header: + for level, hr in enumerate(self.header): this_header = [] @@ -600,7 +605,10 @@ cdef class TextReader: self.c_encoding, errors) if name == '': - name = 'Unnamed: %d' % i + if self.has_mi_columns: + name = 'Unnamed: %d_level_%d' % (i,level) + else: + name = 'Unnamed: %d' % i count = counts.get(name, 0) if count > 0 and self.mangle_dupe_cols: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 616fdd5ca2549..101bdc76ba443 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4991,7 +4991,7 @@ def test_to_csv_multiindex(self): with ensure_clean(pname) as path: # GH3571, GH1651, GH3141 - # column & index are multi-iindex + # column & index are multi-index df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) df.to_csv(path) result = read_csv(path,header=[0,1,2,3],index_col=[0,1]) @@ -5003,6 +5003,22 @@ def test_to_csv_multiindex(self): result = read_csv(path,header=[0,1,2,3],index_col=0) assert_frame_equal(df,result) + # dup column names? 
+ df = mkdf(5,3,r_idx_nlevels=3,c_idx_nlevels=4) + df.to_csv(path) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1]) + result.columns = ['R2','A','B','C'] + new_result = result.reset_index().set_index(['R0','R1','R2']) + new_result.columns = df.columns + assert_frame_equal(df,new_result) + + # column & index are multi-index (compatibility) + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path,multi_index_columns_compat=True) + result = read_csv(path,header=0,index_col=[0,1],multi_index_columns_compat=True) + result.columns = df.columns + assert_frame_equal(df,result) + with ensure_clean(pname) as path: # empty tsframe[:0].to_csv(path) From a9a89f89e13cf006f6b58da1747aa65f86f74cfb Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 14 May 2013 16:57:28 -0400 Subject: [PATCH 6/7] DOC: updated releasenotes, v0.11.1 whatsnew, io.rst CLN: changed formatting option: multi_index_columns_compat -> tupleize_cols BUG: incorrectly writing sparse levels for the multi_index DOC: slight docs changes TST: added tests/fixes for dissallowed options in to_csv (cols=not None,index=False) TST: from_csv not accepting tupleize_cols ENH: allow index=False in to_csv with a multi_index column allow reading of a multi_index column with with index_col=None DOC: updates to examples in io.rst and v0.11.1.rst TST: disallow names, usecols, non-numeric in index_cols BUG: raise on too many rows in the header if multi_index of columns --- RELEASE.rst | 15 +++++ doc/source/io.rst | 59 +++++++++++++------- doc/source/v0.11.1.txt | 37 +++++++++++++ pandas/core/format.py | 31 +++++++---- pandas/core/frame.py | 15 +++-- pandas/io/parsers.py | 97 +++++++++++++++++++++------------ pandas/io/tests/test_parsers.py | 26 ++++++--- pandas/src/parser.pyx | 9 +-- pandas/tests/test_frame.py | 76 +++++++++++++++++++++----- 9 files changed, 265 insertions(+), 100 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index acb4f429e81b0..74bafd419af54 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ 
-34,6 +34,15 @@ pandas 0.11.1 courtesy of @cpcloud. (GH3477_) - Support for reading Amazon S3 files. (GH3504_) - Added module for reading and writing Stata files: pandas.io.stata (GH1512_) + - Added support for writing in ``to_csv`` and reading in ``read_csv``, + multi-index columns. The ``header`` option in ``read_csv`` now accepts a + list of the rows from which to read the index. Added the option, + ``tupleize_cols`` to provide compatiblity for the pre 0.11.1 behavior of + writing and reading multi-index columns via a list of tuples. The default in + 0.11.1 is to write lists of tuples and *not* interpret list of tuples as a + multi-index column. + Note: The default value will change in 0.12 to make the default *to* write and + read multi-index columns in the new format. (GH3571_, GH1651_, GH3141_) **Improvements to existing features** @@ -180,6 +189,7 @@ pandas 0.11.1 .. _GH3596: https://github.com/pydata/pandas/issues/3596 .. _GH3617: https://github.com/pydata/pandas/issues/3617 .. _GH3435: https://github.com/pydata/pandas/issues/3435 +<<<<<<< HEAD .. _GH3611: https://github.com/pydata/pandas/issues/3611 .. _GH3062: https://github.com/pydata/pandas/issues/3062 .. _GH3624: https://github.com/pydata/pandas/issues/3624 @@ -187,6 +197,11 @@ pandas 0.11.1 .. _GH3601: https://github.com/pydata/pandas/issues/3601 .. _GH3631: https://github.com/pydata/pandas/issues/3631 .. _GH1512: https://github.com/pydata/pandas/issues/1512 +======= +.. _GH3571: https://github.com/pydata/pandas/issues/3571 +.. _GH1651: https://github.com/pydata/pandas/issues/1651 +.. 
_GH3141: https://github.com/pydata/pandas/issues/3141 +>>>>>>> DOC: updated releasenotes, v0.11.1 whatsnew, io.rst pandas 0.11.0 diff --git a/doc/source/io.rst b/doc/source/io.rst index ef223f64d43c8..42ea4a2ca5d53 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -115,10 +115,10 @@ They can take a number of arguments: - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines ` - ``usecols``: a subset of columns to return, results in much faster parsing time and lower memory usage. - - ``mangle_dup_columns``: boolean, default True, then duplicate columns will be specified + - ``mangle_dupe_cols``: boolean, default True, then duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' - - ``multi_index_columns_compat``: boolean, default False, leave a list of tuples on columns - as is (default is to convert to a Multi Index on the columns) + - ``tupleize_cols``: boolean, default True, if False, convert a list of tuples + to a multi-index of columns, otherwise, leave the column index as a list of tuples .. ipython:: python :suppress: @@ -260,24 +260,6 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) -.. _io.multi_index_columns: - -Specifying a multi-index columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By specifying list of row locations for the ``header`` argument, you -can read in a multi-index for the columns. Specifying non-consecutive -rows will skip the interveaing rows. The ``index_col`` must also be -specified. - -.. ipython:: python - - data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n' - pd.read_csv(StringIO(data), header=[0,1], index_col=[0]) - -You can pass ``multi_index_columns_compat=True`` to preserve the pre-0.12 behavior of -not converting a list of tuples in the columns to a Multi Index. - .. 
_io.usecols: Filtering columns (``usecols``) @@ -787,6 +769,36 @@ column numbers to turn multiple columns into a ``MultiIndex``: df df.ix[1978] +.. _io.multi_index_columns: + +Specifying a multi-index columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By specifying list of row locations for the ``header`` argument, you +can read in a multi-index for the columns. Specifying non-consecutive +rows will skip the interveaing rows. + +.. ipython:: python + + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv('mi.csv',tupleize_cols=False) + print open('mi.csv').read() + pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + +Note: The default behavior in 0.11.1 remains unchanged (``tupleize_cols=True``), +but starting with 0.12, the default *to* write and read multi-index columns will be in the new +format (``tupleize_cols=False``) + +Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it +with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will be *lost*. + +.. ipython:: python + :suppress: + + import os + os.remove('mi.csv') + .. _io.sniff: Automatically "sniffing" the delimiter @@ -870,6 +882,8 @@ function takes a number of arguments. Only the first is required. - ``sep`` : Field delimiter for the output file (default ",") - ``encoding``: a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 + - ``tupleize_cols``: boolean, default True, if False, write as a list of tuples, + otherwise write in an expanded line format suitable for ``read_csv`` Writing a formatted string ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -901,6 +915,9 @@ The Series object also has a ``to_string`` method, but with only the ``buf``, which, if set to ``True``, will additionally output the length of the Series. 
+HTML +---- + Reading HTML format ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index aed95188db26e..a724ce96a7381 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -73,6 +73,7 @@ Enhancements an index with a different frequency than the existing, or attempting to append an index with a different name than the existing - support datelike columns with a timezone as data_columns (GH2852_) + - ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is a list or tuple. - Added module for reading and writing Stata files: pandas.io.stata (GH1512_) @@ -80,6 +81,39 @@ Enhancements ``Series`` with object dtype. See the examples section in the regular docs :ref:`Replacing via String Expression ` + - Multi-index column support for reading and writing csvs + + - The ``header`` option in ``read_csv`` now accepts a + list of the rows from which to read the index. + + - The option, ``tupleize_cols`` can now be specified in both ``to_csv`` and + ``read_csv``, to provide compatiblity for the pre 0.11.1 behavior of + writing and reading multi-index columns via a list of tuples. The default in + 0.11.1 is to write lists of tuples and *not* interpret list of tuples as a + multi-index column. + + Note: The default behavior in 0.11.1 remains unchanged, but starting with 0.12, + the default *to* write and read multi-index columns will be in the new + format. (GH3571_, GH1651_, GH3141_) + + - If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it + with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will + be *lost*. + + .. ipython:: python + + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv('mi.csv',tupleize_cols=False) + print open('mi.csv').read() + pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + + .. 
ipython:: python + :suppress: + + import os + os.remove('mi.csv') + See the `full release notes `__ or issue tracker on GitHub for a complete list. @@ -96,3 +130,6 @@ on GitHub for a complete list. .. _GH1512: https://github.com/pydata/pandas/issues/1512 .. _GH2285: https://github.com/pydata/pandas/issues/2285 .. _GH3631: https://github.com/pydata/pandas/issues/3631 +.. _GH3571: https://github.com/pydata/pandas/issues/3571 +.. _GH1651: https://github.com/pydata/pandas/issues/1651 +.. _GH3141: https://github.com/pydata/pandas/issues/3141 diff --git a/pandas/core/format.py b/pandas/core/format.py index e7ac540343d84..cd4364edc6662 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -775,7 +775,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, - multi_index_columns_compat=False): + tupleize_cols=True): self.engine = engine # remove for 0.12 @@ -804,7 +804,15 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, msg= "columns.is_unique == False not supported with engine='python'" raise NotImplementedError(msg) - self.multi_index_columns_compat=multi_index_columns_compat + self.tupleize_cols = tupleize_cols + self.has_mi_columns = isinstance(obj.columns, MultiIndex + ) and not self.tupleize_cols + + # validate mi options + if self.has_mi_columns: + if cols is not None: + raise Exception("cannot specify cols with a multi_index on the columns") + if cols is not None: if isinstance(cols,Index): cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) @@ -960,9 +968,8 @@ def _save_header(self): obj = self.obj index_label = self.index_label cols = self.cols + has_mi_columns = self.has_mi_columns header = self.header - has_mi_columns = isinstance(obj.columns, MultiIndex - ) and not self.multi_index_columns_compat encoded_labels = [] 
has_aliases = isinstance(header, (tuple, list, np.ndarray)) @@ -1017,15 +1024,17 @@ def _save_header(self): # write out the names for each level, then ALL of the values for each level for i in range(columns.nlevels): - # name is the first column - col_line = [ columns.names[i] ] + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + + # name is the first column + col_line.append( columns.names[i] ) - # skipp len labels-1 - if self.index and isinstance(index_label,list) and len(index_label)>1: - col_line.extend([ '' ] * (len(index_label)-1)) + if isinstance(index_label,list) and len(index_label)>1: + col_line.extend([ '' ] * (len(index_label)-1)) - for j in range(len(columns)): - col_line.append(columns.levels[i][j]) + col_line.extend(columns.get_level_values(i)) writer.writerow(col_line) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb7416b23aab4..d91d21db3ec1b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1250,7 +1250,7 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): @classmethod def from_csv(cls, path, header=0, sep=',', index_col=0, - parse_dates=True, encoding=None): + parse_dates=True, encoding=None, tupleize_cols=False): """ Read delimited file into DataFrame @@ -1266,6 +1266,9 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, is used. Different default from read_table parse_dates : boolean, default True Parse dates. 
Different default from read_table + tupleize_cols : boolean, default True + write multi_index columns as a list of tuples (if True) + or new (expanded format) if False) Notes ----- @@ -1280,7 +1283,7 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, from pandas.io.parsers import read_table return read_table(path, header=header, sep=sep, parse_dates=parse_dates, index_col=index_col, - encoding=encoding) + encoding=encoding,tupleize_cols=False) @classmethod def from_dta(dta, path, parse_dates=True, convert_categoricals=True, encoding=None, index_col=None): @@ -1392,7 +1395,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, - multi_index_columns_compat=False, **kwds): + tupleize_cols=True, **kwds): """ Write DataFrame to a comma-separated values (csv) file @@ -1430,9 +1433,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL chunksize : rows to write at a time - multi_index_columns_compat : boolean, default False + tupleize_cols : boolean, default True write multi_index columns as a list of tuples (if True) - or new (expanded format)m if False) + or new (expanded format) if False) """ if nanRep is not None: # pragma: no cover import warnings @@ -1450,7 +1453,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, header=header, index=index, index_label=index_label,mode=mode, chunksize=chunksize,engine=kwds.get("engine"), - multi_index_columns_compat=multi_index_columns_compat) + tupleize_cols=tupleize_cols) formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 78a941218c1d6..8063a8d667c54 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -127,9 +127,9 @@ class 
DateConversionError(Exception): usecols : array-like Return a subset of the columns. Results in much faster parsing time and lower memory usage. -mangle_dup_columns: boolean, default True +mangle_dupe_cols: boolean, default True Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' -multi_index_columns_compat: boolean, default False +tupleize_cols: boolean, default False Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns) @@ -299,7 +299,7 @@ def _read(filepath_or_buffer, kwds): 'squeeze': False, 'compression': None, 'mangle_dupe_cols': True, - 'multi_index_columns_compat':False, + 'tupleize_cols':True, } @@ -387,7 +387,7 @@ def parser_f(filepath_or_buffer, encoding=None, squeeze=False, mangle_dupe_cols=True, - multi_index_columns_compat=False, + tupleize_cols=True, ): # Alias sep -> delimiter. @@ -446,6 +446,7 @@ def parser_f(filepath_or_buffer, low_memory=low_memory, buffer_lines=buffer_lines, mangle_dupe_cols=mangle_dupe_cols, + tupleize_cols=tupleize_cols, ) return _read(filepath_or_buffer, kwds) @@ -540,16 +541,6 @@ def __init__(self, f, engine='python', **kwds): if kwds.get('header', 'infer') == 'infer': kwds['header'] = 0 if kwds.get('names') is None else None - # validate header options for mi - h = kwds['header'] - if isinstance(h,(list,tuple,np.ndarray)): - if kwds.get('index_col') is None: - raise Exception("must have an index_col when have a " - "multi-index header is specified") - if kwds.get('as_recarray'): - raise Exception("cannot specify as_recarray when " - "specifying a multi-index header") - self.orig_options = kwds # miscellanea @@ -737,11 +728,31 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') - self.multi_index_columns_compat = kwds.get('multi_index_columns_compat',False) + self.tupleize_cols = kwds.get('tupleize_cols',True) self._date_conv = 
_make_date_converter(date_parser=self.date_parser, dayfirst=self.dayfirst) + # validate header options for mi + self.header = kwds.get('header') + if isinstance(self.header,(list,tuple,np.ndarray)): + if kwds.get('as_recarray'): + raise Exception("cannot specify as_recarray when " + "specifying a multi-index header") + if kwds.get('usecols'): + raise Exception("cannot specify usecols when " + "specifying a multi-index header") + if kwds.get('names'): + raise Exception("cannot specify names when " + "specifying a multi-index header") + + # validate index_col that only contains integers + if self.index_col is not None: + if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all( + [ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)): + raise Exception("index_col must only contain row numbers " + "when specifying a multi-index header") + self._name_processed = False @property @@ -774,27 +785,46 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_ # 0 is the name of the index, assuming index_col is a list of column # numbers ic = self.index_col + if ic is None: + ic = [] + if not isinstance(ic, (list,tuple,np.ndarray)): ic = [ ic ] sic = set(ic) orig_header = list(header) + + # clean the index_names index_names = header.pop(-1) - index_names = [ index_names[i] for i in ic ] - field_count = len(header[0]) + (index_names, names, + index_col) = _clean_index_names(index_names, self.index_col) + # extract the columns + field_count = len(header[0]) def extract(r): return tuple([ r[i] for i in range(field_count) if i not in sic ]) + columns = zip(*[ extract(r) for r in header ]) + names = ic + columns + + # if we find 'Unnamed' all of a single level, then our header was too long + for n in range(len(columns[0])): + if all([ 'Unnamed' in c[n] for c in columns ]): + raise Exception("Passed header=[%s] are too many rows for this " + "multi_index of columns" % ','.join([ str(x) for x in self.header ])) + + # 
clean the column names (if we have an index_col) + if len(ic): + col_names = [ r[0] if len(r[0]) and 'Unnamed' not in r[0] else None for r in header ] + else: + col_names = [ None ] * len(header) - names = ic + zip(*[ extract(r) for r in header ]) - col_names = [ r[0] if len(r[0]) else None for r in header ] passed_names = True return names, index_names, col_names, passed_names def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here - if not self.multi_index_columns_compat and len(columns) and not isinstance( + if not self.tupleize_cols and len(columns) and not isinstance( columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): columns = MultiIndex.from_tuples(columns,names=col_names) return columns @@ -1008,13 +1038,8 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( - self._reader.header, self.index_names, self.col_names, passed_names) - else: - raise Exception("must have an index_col when have a multi-index " - "header is specified") + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names) else: self.names = list(self._reader.header[0]) @@ -1248,7 +1273,6 @@ def __init__(self, f, **kwds): raise Exception("usecols not supported with engine='python'" " or multicharacter separators (yet).") - self.header = kwds['header'] self.encoding = kwds['encoding'] self.compression = kwds['compression'] self.skiprows = kwds['skiprows'] @@ -1466,14 +1490,15 @@ def _infer_columns(self): else: this_columns.append(c) - if self.mangle_dupe_cols: - counts = {} - for i, col in enumerate(this_columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - this_columns[i] = 
'%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 - + if not have_mi_columns: + if self.mangle_dupe_cols: + counts = {} + for i, col in enumerate(this_columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + this_columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + columns.append(this_columns) self._clear_buffer() diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index b9e773a916d4c..be47f28749848 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1014,20 +1014,30 @@ def test_header_multi_index(self): # basic test with both engines for engine in ['c','python']: - df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], engine=engine) + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False, + engine=engine) tm.assert_frame_equal(df, expected) - # must specify index_col - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3]) + # skipping lines in the header + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + #### invalid options #### # no as_recarray self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], - index_col=[0,1], as_recarray=True) - - # skipping lines in the header - df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1]) - tm.assert_frame_equal(df, expected) + index_col=[0,1], as_recarray=True, tupleize_cols=False) + # names + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], names=['foo','bar'], tupleize_cols=False) + # usecols + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) + # non-numeric index_col + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=['foo','bar'], tupleize_cols=False) + def test_pass_names_with_index(self): lines = 
self.data1.split('\n') no_header = '\n'.join(lines[1:]) diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 62e9d39cd792d..46f74cbdc885d 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -252,7 +252,7 @@ cdef class TextReader: object encoding object compression object mangle_dupe_cols - object multi_index_columns_compat + object tupleize_cols set noconvert, usecols def __cinit__(self, source, @@ -306,13 +306,13 @@ cdef class TextReader: skip_footer=0, verbose=False, mangle_dupe_cols=True, - multi_index_columns_compat=False): + tupleize_cols=True): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols=mangle_dupe_cols - self.multi_index_columns_compat=multi_index_columns_compat + self.tupleize_cols=tupleize_cols # For timekeeping self.clocks = [] @@ -452,6 +452,7 @@ cdef class TextReader: if isinstance(header, list) and len(header): # need to artifically skip the final line # which is still a header line + header = list(header) header.append(header[-1]+1) self.parser.header_start = header[0] @@ -611,7 +612,7 @@ cdef class TextReader: name = 'Unnamed: %d' % i count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols: + if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: this_header.append('%s.%d' % (name, count)) else: this_header.append(name) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 101bdc76ba443..68e69768097e7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4755,13 +4755,15 @@ def test_to_csv_moar(self): def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None, dupe_col=False): - header = 0 if cnlvl: - header = range(cnlvl) - - with ensure_clean(path) as path: - df.to_csv(path,encoding='utf8',chunksize=chunksize) - recons = DataFrame.from_csv(path,header=header,parse_dates=False) + header = range(cnlvl) + with ensure_clean(path) as path: + 
df.to_csv(path,encoding='utf8',chunksize=chunksize,tupleize_cols=False) + recons = DataFrame.from_csv(path,header=range(cnlvl),tupleize_cols=False,parse_dates=False) + else: + with ensure_clean(path) as path: + df.to_csv(path,encoding='utf8',chunksize=chunksize) + recons = DataFrame.from_csv(path,header=0,parse_dates=False) def _to_uni(x): if not isinstance(x,unicode): @@ -4991,34 +4993,80 @@ def test_to_csv_multiindex(self): with ensure_clean(pname) as path: # GH3571, GH1651, GH3141 + def _make_frame(names=None): + if names is True: + names = ['first','second'] + return DataFrame(np.random.randint(0,10,size=(3,3)), + columns=MultiIndex.from_tuples([('bah', 'foo'), + ('bah', 'bar'), + ('ban', 'baz')], + names=names)) + # column & index are multi-index df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) - df.to_csv(path) - result = read_csv(path,header=[0,1,2,3],index_col=[0,1]) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) assert_frame_equal(df,result) # column is mi df = mkdf(5,3,r_idx_nlevels=1,c_idx_nlevels=4) - df.to_csv(path) - result = read_csv(path,header=[0,1,2,3],index_col=0) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=0,tupleize_cols=False) assert_frame_equal(df,result) # dup column names? 
df = mkdf(5,3,r_idx_nlevels=3,c_idx_nlevels=4) - df.to_csv(path) - result = read_csv(path,header=[0,1,2,3],index_col=[0,1]) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) result.columns = ['R2','A','B','C'] new_result = result.reset_index().set_index(['R0','R1','R2']) new_result.columns = df.columns assert_frame_equal(df,new_result) + # writing with no index + df = _make_frame() + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + assert_frame_equal(df,result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + self.assert_(all([ x is None for x in result.columns.names ])) + result.columns.names = df.columns.names + assert_frame_equal(df,result) + + # whatsnew example + df = _make_frame() + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + # column & index are multi-index (compatibility) df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) - df.to_csv(path,multi_index_columns_compat=True) - result = read_csv(path,header=0,index_col=[0,1],multi_index_columns_compat=True) + df.to_csv(path,tupleize_cols=True) + result = read_csv(path,header=0,index_col=[0,1],tupleize_cols=True) result.columns = df.columns assert_frame_equal(df,result) + # invalid options + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + + # catch invalid headers + for i in [3,4,5,6,7]: + self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=range(i), index_col=0) + self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=[0,2], index_col=0) + + # write with cols + 
self.assertRaises(Exception, df.to_csv, path,tupleize_cols=False,cols=['foo','bar']) + with ensure_clean(pname) as path: # empty tsframe[:0].to_csv(path) From faf4d53c58bbe430942afc3775e29192318beac7 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 18 May 2013 19:12:58 -0400 Subject: [PATCH 7/7] TST: test for tupleize_cols=True,index=False TST: better error messages on multi_index column read failure --- pandas/io/parsers.py | 4 ++-- pandas/src/parser.pyx | 10 +++++++--- pandas/tests/test_frame.py | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8063a8d667c54..61be871e62595 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -809,8 +809,8 @@ def extract(r): # if we find 'Unnamed' all of a single level, then our header was too long for n in range(len(columns[0])): if all([ 'Unnamed' in c[n] for c in columns ]): - raise Exception("Passed header=[%s] are too many rows for this " - "multi_index of columns" % ','.join([ str(x) for x in self.header ])) + raise _parser.CParserError("Passed header=[%s] are too many rows for this " + "multi_index of columns" % ','.join([ str(x) for x in self.header ])) # clean the column names (if we have an index_col) if len(ic): diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 46f74cbdc885d..ee92e2e60960c 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -244,7 +244,7 @@ cdef class TextReader: object na_values, true_values, false_values object memory_map object as_recarray - object header, names, header_start, header_end + object header, orig_header, names, header_start, header_end object low_memory object skiprows object compact_ints, use_unsigned @@ -441,6 +441,7 @@ cdef class TextReader: # TODO: no header vs. 
header is not the first row self.has_mi_columns = 0 + self.orig_header = header if header is None: # sentinel value self.parser.header_start = -1 @@ -585,8 +586,11 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines if self.parser.lines < hr + 1: - raise CParserError('Passed header=%d but only %d lines in file' - % (self.parser.header, self.parser.lines)) + msg = self.orig_header + if isinstance(msg,list): + msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) + raise CParserError('Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 68e69768097e7..fa2e8131b6916 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5037,6 +5037,13 @@ def _make_frame(names=None): result.columns.names = df.columns.names assert_frame_equal(df,result) + # tupleize_cols=True and index=False + df = _make_frame(True) + df.to_csv(path,tupleize_cols=True,index=False) + result = read_csv(path,header=0,tupleize_cols=True,index_col=None) + result.columns = df.columns + assert_frame_equal(df,result) + # whatsnew example df = _make_frame() df.to_csv(path,tupleize_cols=False) @@ -5060,6 +5067,18 @@ def _make_frame(names=None): df.to_csv(path,tupleize_cols=False) # catch invalid headers + try: + read_csv(path,tupleize_cols=False,header=range(3),index_col=0) + except (Exception), detail: + if not str(detail).startswith('Passed header=[0,1,2] are too many rows for this multi_index of columns'): + raise AssertionError("failure in read_csv header=range(3)") + + try: + read_csv(path,tupleize_cols=False,header=range(7),index_col=0) + except (Exception), detail: + if not str(detail).startswith('Passed header=[0,1,2,3,4,5,6], len of 7, but only 6 lines in file'): + raise AssertionError("failure in read_csv header=range(7)") + for i in [3,4,5,6,7]: 
self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=range(i), index_col=0) self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=[0,2], index_col=0)