diff --git a/RELEASE.rst b/RELEASE.rst
index acb4f429e81b0..74bafd419af54 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -34,6 +34,15 @@ pandas 0.11.1
       courtesy of @cpcloud. (GH3477_)
   - Support for reading Amazon S3 files. (GH3504_)
   - Added module for reading and writing Stata files: pandas.io.stata (GH1512_)
+  - Added support for writing multi-index columns in ``to_csv`` and reading
+    them in ``read_csv``. The ``header`` option in ``read_csv`` now accepts a
+    list of the rows from which to read the column index. Added the option
+    ``tupleize_cols`` to provide compatibility with the pre-0.11.1 behavior of
+    writing and reading multi-index columns via a list of tuples. The default
+    in 0.11.1 is to write lists of tuples and *not* to interpret a list of
+    tuples as a multi-index column.
+    Note: the default will change in 0.12 so that multi-index columns are
+    written and read in the new format by default. (GH3571_, GH1651_, GH3141_)

 **Improvements to existing features**

@@ -187,6 +196,9 @@ pandas 0.11.1
 .. _GH3601: https://github.com/pydata/pandas/issues/3601
 .. _GH3631: https://github.com/pydata/pandas/issues/3631
 .. _GH1512: https://github.com/pydata/pandas/issues/1512
+.. _GH3571: https://github.com/pydata/pandas/issues/3571
+.. _GH1651: https://github.com/pydata/pandas/issues/1651
+.. _GH3141: https://github.com/pydata/pandas/issues/3141

 pandas 0.11.0

diff --git a/doc/source/io.rst b/doc/source/io.rst
index f15f758c42b18..42ea4a2ca5d53 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -57,7 +57,10 @@ They can take a number of arguments:
     specified, data types will be inferred.
   - ``header``: row number to use as the column names, and the start of the
     data.  Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly
-    pass ``header=0`` to be able to replace existing names.
+    pass ``header=0`` to be able to replace existing names. The header can also
+    be a list of integers that specify row locations for a multi-index on the
+    columns, e.g. ``[0,1,3]``. Intervening rows that are not specified will be
+    skipped (e.g. row 2 in this example is skipped).
   - ``skiprows``: A collection of numbers for rows in the file to skip. Can
     also be an integer to skip the first ``n`` rows
   - ``index_col``: column number, column name, or list of column numbers/names,
@@ -112,6 +115,11 @@ They can take a number of arguments:
   - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines `
   - ``usecols``: a subset of columns to return, results in much faster parsing
     time and lower memory usage.
+  - ``mangle_dupe_cols``: boolean, default True; if True, duplicate columns
+    will be renamed 'X', 'X.1', ..., 'X.N' rather than all being left as 'X'
+  - ``tupleize_cols``: boolean, default True; if False, convert a list of
+    tuples in the header to a multi-index on the columns, otherwise leave
+    the column index as a list of tuples

 .. ipython:: python
    :suppress:
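To make the two new reader options above concrete, here is a minimal sketch (Python 2, matching this codebase; the inline data is invented for illustration)::

    import pandas as pd
    from StringIO import StringIO  # Python 2 idiom, matching the codebase era

    # mangle_dupe_cols=True (the default) renames duplicate columns on read,
    # so both 'a' columns survive instead of colliding
    data = "a,a,b\n1,2,3\n4,5,6\n"
    df = pd.read_csv(StringIO(data))
    print list(df.columns)  # ['a', 'a.1', 'b']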
@@ -762,6 +769,36 @@ column numbers to turn multiple columns into a ``MultiIndex``:
    df
    df.ix[1978]

+.. _io.multi_index_columns:
+
+Specifying multi-index columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By specifying a list of row locations for the ``header`` argument, you
+can read in a multi-index for the columns. Specifying non-consecutive
+rows will skip the intervening rows.
+
+.. ipython:: python
+
+   from pandas.util.testing import makeCustomDataframe as mkdf
+   df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+   df.to_csv('mi.csv',tupleize_cols=False)
+   print open('mi.csv').read()
+   pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False)
+
+Note: The default behavior in 0.11.1 remains unchanged (``tupleize_cols=True``),
+but starting with 0.12, the default will be to write and read multi-index
+columns in the new format (``tupleize_cols=False``).
+
+Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
+with ``df.to_csv(..., index=False)``), then any ``names`` on the column index will be *lost*.
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('mi.csv')
+
 .. _io.sniff:

 Automatically "sniffing" the delimiter
@@ -845,6 +882,9 @@ function takes a number of arguments. Only the first is required.

  - ``sep`` : Field delimiter for the output file (default ",")
  - ``encoding``: a string representing the encoding to use if the contents are
    non-ascii, for python versions prior to 3
+ - ``tupleize_cols``: boolean, default True; if True, write multi-index columns
+   as a list of tuples; if False, write them in the new expanded line format
+   (one row per column level), suitable for round-tripping with ``read_csv``

 Writing a formatted string
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -876,6 +915,9 @@ The Series object also has a ``to_string`` method, but with only the ``buf``,
 which, if set to ``True``, will additionally output the length
 of the Series.

+HTML
+----
+
 Reading HTML format
 ~~~~~~~~~~~~~~~~~~~~~~
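The *lost names* caveat above is easy to trip over; a minimal sketch (the frame, file name, and level names here are invented for the example)::

    import numpy as np
    import pandas as pd
    from pandas import MultiIndex

    cols = MultiIndex.from_tuples([('a', 'x'), ('a', 'y')],
                                  names=['first', 'second'])
    df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=cols)

    # no index written, so there is no index-name line in the file
    df.to_csv('noidx.csv', index=False, tupleize_cols=False)
    result = pd.read_csv('noidx.csv', header=[0, 1], tupleize_cols=False)
    print result.columns.names  # [None, None] -- 'first'/'second' are gone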
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
index aed95188db26e..a724ce96a7381 100644
--- a/doc/source/v0.11.1.txt
+++ b/doc/source/v0.11.1.txt
@@ -73,6 +73,7 @@ Enhancements
     an index with a different frequency than the existing, or attempting
     to append an index with a different name than the existing
   - support datelike columns with a timezone as data_columns (GH2852_)
+  - ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is a list or tuple.

   - Added module for reading and writing Stata files: pandas.io.stata (GH1512_)

@@ -80,6 +81,39 @@ Enhancements
   ``Series`` with object dtype. See the examples section in the regular docs
   :ref:`Replacing via String Expression `

+  - Multi-index column support for reading and writing CSVs
+
+    - The ``header`` option in ``read_csv`` now accepts a
+      list of the rows from which to read the column index.
+
+    - The option ``tupleize_cols`` can now be specified in both ``to_csv`` and
+      ``read_csv``, to provide compatibility with the pre-0.11.1 behavior of
+      writing and reading multi-index columns via a list of tuples. The default
+      in 0.11.1 is to write lists of tuples and *not* to interpret a list of
+      tuples as a multi-index column.
+
+      Note: The default behavior in 0.11.1 remains unchanged, but starting with
+      0.12, the default will be to write and read multi-index columns in the
+      new format. (GH3571_, GH1651_, GH3141_)
+
+    - If an ``index_col`` is not specified (e.g. you don't have an index, or wrote
+      it with ``df.to_csv(..., index=False)``), then any ``names`` on the column
+      index will be *lost*.
+
+      .. ipython:: python
+
+         from pandas.util.testing import makeCustomDataframe as mkdf
+         df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+         df.to_csv('mi.csv',tupleize_cols=False)
+         print open('mi.csv').read()
+         pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False)
+
+      .. ipython:: python
+         :suppress:
+
+         import os
+         os.remove('mi.csv')
+
 See the `full release notes
 `__ or issue tracker
 on GitHub for a complete list.
@@ -96,3 +130,6 @@ on GitHub for a complete list.
 .. _GH1512: https://github.com/pydata/pandas/issues/1512
 .. _GH2285: https://github.com/pydata/pandas/issues/2285
 .. _GH3631: https://github.com/pydata/pandas/issues/3631
+.. _GH3571: https://github.com/pydata/pandas/issues/3571
+.. _GH1651: https://github.com/pydata/pandas/issues/1651
+.. _GH3141: https://github.com/pydata/pandas/issues/3141
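Before the ``CSVFormatter`` changes below, it helps to see the expanded layout in isolation: one line per column level (the level name leads when an index is written), then a line carrying the index names, padded with blanks so every row has the same number of fields. A standalone sketch, independent of pandas internals (all names are made up)::

    import csv

    # toy stand-ins for obj.columns and the index names
    col_tuples = [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')]
    col_names = ['first', 'second']   # one name per column level
    index_names = ['R0']              # one name per index level

    with open('sketch.csv', 'wb') as f:  # 'wb' for the csv module on Python 2
        writer = csv.writer(f)
        nlevels = len(col_tuples[0])
        for i in range(nlevels):
            # level name in the first (index) column, then that level's values
            writer.writerow([col_names[i]] + [t[i] for t in col_tuples])
        # index-name line, padded with blanks so every row has the same width
        writer.writerow(index_names + [''] * len(col_tuples))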
diff --git a/pandas/core/format.py b/pandas/core/format.py
index bea4b59bfaaa4..cd4364edc6662 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -772,9 +772,10 @@ def grouper(x):
 class CSVFormatter(object):

     def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
-                 cols=None, header=True, index=True, index_label=None,
-                 mode='w', nanRep=None, encoding=None, quoting=None,
-                 line_terminator='\n', chunksize=None, engine=None):
+                 cols=None, header=True, index=True, index_label=None,
+                 mode='w', nanRep=None, encoding=None, quoting=None,
+                 line_terminator='\n', chunksize=None, engine=None,
+                 tupleize_cols=True):

         self.engine = engine  # remove for 0.12
@@ -803,6 +804,15 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
             msg = "columns.is_unique == False not supported with engine='python'"
             raise NotImplementedError(msg)

+        self.tupleize_cols = tupleize_cols
+        self.has_mi_columns = (isinstance(obj.columns, MultiIndex)
+                               and not self.tupleize_cols)
+
+        # validate multi-index options
+        if self.has_mi_columns:
+            if cols is not None:
+                raise Exception("cannot specify cols with a multi_index"
+                                " on the columns")
+
         if cols is not None:
             if isinstance(cols, Index):
                 cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
@@ -958,48 +968,73 @@ def _save_header(self):
         obj = self.obj
         index_label = self.index_label
         cols = self.cols
+        has_mi_columns = self.has_mi_columns
         header = self.header
+        encoded_labels = []

         has_aliases = isinstance(header, (tuple, list, np.ndarray))
-        if has_aliases or self.header:
-            if self.index:
-                # should write something for index label
-                if index_label is not False:
-                    if index_label is None:
-                        if isinstance(obj.index, MultiIndex):
-                            index_label = []
-                            for i, name in enumerate(obj.index.names):
-                                if name is None:
-                                    name = ''
-                                index_label.append(name)
+        if not (has_aliases or self.header):
+            return
+
+        if self.index:
+            # should write something for index label
+            if index_label is not False:
+                if index_label is None:
+                    if isinstance(obj.index, MultiIndex):
+                        index_label = []
+                        for i, name in enumerate(obj.index.names):
+                            if name is None:
+                                name = ''
+                            index_label.append(name)
+                    else:
+                        index_label = obj.index.name
+                        if index_label is None:
+                            index_label = ['']
                         else:
-                            index_label = obj.index.name
-                            if index_label is None:
-                                index_label = ['']
-                            else:
-                                index_label = [index_label]
-                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
-                        # given a string for a DF with Index
-                        index_label = [index_label]
+                            index_label = [index_label]
+                elif not isinstance(index_label, (list, tuple, np.ndarray)):
+                    # given a string for a DF with Index
+                    index_label = [index_label]

-                encoded_labels = list(index_label)
-            else:
-                encoded_labels = []
+            encoded_labels = list(index_label)
+        else:
+            encoded_labels = []

-            if has_aliases:
-                if len(header) != len(cols):
-                    raise ValueError(('Writing %d cols but got %d aliases'
-                                      % (len(cols), len(header))))
-                else:
-                    write_cols = header
+        if has_aliases:
+            if len(header) != len(cols):
+                raise ValueError(('Writing %d cols but got %d aliases'
+                                  % (len(cols), len(header))))
             else:
-                write_cols = cols
-            encoded_cols = list(write_cols)
-
-            writer.writerow(encoded_labels + encoded_cols)
+                write_cols = header
         else:
-            encoded_cols = list(cols)
-            writer.writerow(encoded_cols)
+            write_cols = cols
+
+        if not has_mi_columns:
+            encoded_labels += list(write_cols)
+        else:
+            # write out the multi-index: the names for each level,
+            # then ALL of the values for each level
+            columns = obj.columns
+            for i in range(columns.nlevels):
+
+                # we need at least 1 index column to write our col names
+                col_line = []
+                if self.index:
+
+                    # name is the first column
+                    col_line.append(columns.names[i])
+
+                    if isinstance(index_label, list) and len(index_label) > 1:
+                        col_line.extend([''] * (len(index_label) - 1))
+
+                col_line.extend(columns.get_level_values(i))
+
+                writer.writerow(col_line)
+
+            # add blanks for the columns, so that we
+            # have consistent seps
+            encoded_labels.extend([''] * len(columns))
+
+        # write out the index label line
+        writer.writerow(encoded_labels)

     def _save(self):
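A quick illustration of the validation added in ``CSVFormatter.__init__`` above: selecting a subset of columns via ``cols`` is rejected when writing a column ``MultiIndex`` in the expanded format. The frame contents and file name here are invented (Python 2 ``except`` syntax, matching the test suite)::

    import numpy as np
    import pandas as pd
    from pandas import MultiIndex

    df = pd.DataFrame(np.zeros((2, 2)),
                      columns=MultiIndex.from_tuples([('a', 'b'), ('a', 'c')]))
    try:
        df.to_csv('out.csv', tupleize_cols=False, cols=['a'])
    except Exception, detail:
        # "cannot specify cols with a multi_index on the columns"
        print detail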
Different default from read_table
+        tupleize_cols : boolean, default False
+            If False, interpret a list of tuples in the header as a
+            multi-index on the columns; if True, leave it as a list of tuples

         Notes
         -----
@@ -1280,7 +1283,7 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         from pandas.io.parsers import read_table
         return read_table(path, header=header, sep=sep,
                           parse_dates=parse_dates, index_col=index_col,
-                          encoding=encoding)
+                          encoding=encoding, tupleize_cols=tupleize_cols)

     @classmethod
     def from_dta(dta, path, parse_dates=True, convert_categoricals=True, encoding=None, index_col=None):
@@ -1391,7 +1394,8 @@ def to_panel(self):
     def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
-               line_terminator='\n', chunksize=None,**kwds):
+               line_terminator='\n', chunksize=None,
+               tupleize_cols=True, **kwds):
         """
         Write DataFrame to a comma-separated values (csv) file

@@ -1429,6 +1433,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
         quoting : optional constant from csv module
             defaults to csv.QUOTE_MINIMAL
         chunksize : rows to write at a time
+        tupleize_cols : boolean, default True
+            If True, write multi-index columns as a list of tuples; if False,
+            write them in the new, expanded format (one row per column level)
         """
         if nanRep is not None:  # pragma: no cover
             import warnings
@@ -1445,7 +1452,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                                      float_format=float_format, cols=cols,
                                      header=header, index=index,
                                      index_label=index_label, mode=mode,
-                                     chunksize=chunksize,engine=kwds.get("engine") )
+                                     chunksize=chunksize, engine=kwds.get("engine"),
+                                     tupleize_cols=tupleize_cols)
         formatter.save()

     def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 044b25041afd9..61be871e62595 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -52,9 +52,12 @@ class DateConversionError(Exception):
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
 header : int, default 0 if names parameter not specified, otherwise None
     Row to use for the column labels of the parsed DataFrame. Specify None if
-    there is no header row.
+    there is no header row. Can also be a list of integers that specify row
+    locations for a multi-index on the columns, e.g. [0,1,3]. Intervening
+    rows that are not specified will be skipped (e.g. 2 in this example is
+    skipped)
 skiprows : list-like or integer
     Row numbers to skip (0-indexed) or number of rows to skip (int)
     at the start of the file
@@ -125,6 +127,12 @@ class DateConversionError(Exception):
 usecols : array-like
     Return a subset of the columns. Results in much faster parsing time and
     lower memory usage.
+mangle_dupe_cols : boolean, default True
+    Duplicate columns will be renamed 'X', 'X.1', ..., 'X.N', rather than
+    all being left as 'X'
+tupleize_cols : boolean, default True
+    Leave a list of tuples on columns as is (the default); if False, convert
+    the list of tuples to a MultiIndex on the columns

 Returns
 -------
@@ -292,6 +299,7 @@ def _read(filepath_or_buffer, kwds):
     'squeeze': False,
     'compression': None,
     'mangle_dupe_cols': True,
+    'tupleize_cols': True,
 }

@@ -378,7 +386,8 @@ def parser_f(filepath_or_buffer,
                  verbose=False,
                  encoding=None,
                  squeeze=False,
-                 mangle_dupe_cols=True
+                 mangle_dupe_cols=True,
+                 tupleize_cols=True,
                  ):

     # Alias sep -> delimiter.
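With the writer and the new reader options wired together, the intended round trip looks like this — a minimal sketch mirroring the new tests further below (the file name and level names are invented)::

    import numpy as np
    import pandas as pd
    from pandas import MultiIndex

    columns = MultiIndex.from_tuples([('bah', 'foo'), ('bah', 'bar')],
                                     names=['c0', 'c1'])
    df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=columns)

    # expanded format: one header row per column level, then the index-name line
    df.to_csv('mi_sketch.csv', tupleize_cols=False)
    result = pd.read_csv('mi_sketch.csv', header=[0, 1], index_col=0,
                         tupleize_cols=False)
    print result.columns.names  # ['c0', 'c1'] -- the MultiIndex survives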
@@ -436,7 +445,8 @@ def parser_f(filepath_or_buffer, error_bad_lines=error_bad_lines, low_memory=low_memory, buffer_lines=buffer_lines, - mangle_dupe_cols=mangle_dupe_cols + mangle_dupe_cols=mangle_dupe_cols, + tupleize_cols=tupleize_cols, ) return _read(filepath_or_buffer, kwds) @@ -677,10 +687,8 @@ def read(self, nrows=None): if self.options.get('as_recarray'): return ret - index, columns, col_dict = ret - # May alter columns / col_dict - # index, columns, col_dict = self._create_index(col_dict, columns) + index, columns, col_dict = self._create_index(ret) df = DataFrame(col_dict, columns=columns, index=index) @@ -688,8 +696,9 @@ def read(self, nrows=None): return df[df.columns[0]] return df - def _create_index(self, col_dict, columns): - pass + def _create_index(self, ret): + index, columns, col_dict = ret + return index, columns, col_dict def get_chunk(self, size=None): if size is None: @@ -709,6 +718,7 @@ def __init__(self, kwds): self.index_col = kwds.pop('index_col', None) self.index_names = None + self.col_names = None self.parse_dates = kwds.pop('parse_dates', False) self.date_parser = kwds.pop('date_parser', None) @@ -718,10 +728,31 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') + self.tupleize_cols = kwds.get('tupleize_cols',True) self._date_conv = _make_date_converter(date_parser=self.date_parser, dayfirst=self.dayfirst) + # validate header options for mi + self.header = kwds.get('header') + if isinstance(self.header,(list,tuple,np.ndarray)): + if kwds.get('as_recarray'): + raise Exception("cannot specify as_recarray when " + "specifying a multi-index header") + if kwds.get('usecols'): + raise Exception("cannot specify usecols when " + "specifying a multi-index header") + if kwds.get('names'): + raise Exception("cannot specify names when " + "specifying a multi-index header") + + # validate index_col that only contains integers + if self.index_col is not None: + if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all( + [ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)): + raise Exception("index_col must only contain row numbers " + "when specifying a multi-index header") + self._name_processed = False @property @@ -743,7 +774,62 @@ def _should_parse_dates(self, i): else: return (j in self.parse_dates) or (name in self.parse_dates) - def _make_index(self, data, alldata, columns): + + def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): + """ extract and return the names, index_names, col_names + header is a list-of-lists returned from the parsers """ + if len(header) < 2: + return header[0], index_names, col_names, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if ic is None: + ic = [] + + if not isinstance(ic, (list,tuple,np.ndarray)): + ic = [ ic ] + sic = set(ic) + + orig_header = list(header) + + # clean the index_names + index_names = header.pop(-1) + (index_names, names, + index_col) = _clean_index_names(index_names, self.index_col) + + # extract the columns + field_count = len(header[0]) + def extract(r): + return tuple([ r[i] for i in range(field_count) if i not in sic ]) + columns = zip(*[ extract(r) for r in header ]) + names = ic + columns + + # if we find 'Unnamed' all of a single level, then our header was too long + for n in 
range(len(columns[0])): + if all([ 'Unnamed' in c[n] for c in columns ]): + raise _parser.CParserError("Passed header=[%s] are too many rows for this " + "multi_index of columns" % ','.join([ str(x) for x in self.header ])) + + # clean the column names (if we have an index_col) + if len(ic): + col_names = [ r[0] if len(r[0]) and 'Unnamed' not in r[0] else None for r in header ] + else: + col_names = [ None ] * len(header) + + passed_names = True + + return names, index_names, col_names, passed_names + + def _maybe_make_multi_index_columns(self, columns, col_names=None): + # possibly create a column mi here + if not self.tupleize_cols and len(columns) and not isinstance( + columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): + columns = MultiIndex.from_tuples(columns,names=col_names) + return columns + + def _make_index(self, data, alldata, columns, indexnamerow=False): if not _is_index_col(self.index_col) or len(self.index_col) == 0: index = None @@ -760,7 +846,15 @@ def _make_index(self, data, alldata, columns): index = self._get_complex_date_index(data, columns) index = self._agg_index(index, try_parse_dates=False) - return index + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index.names = indexnamerow[:coffset] + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns _implicit_index = False @@ -942,7 +1036,12 @@ def __init__(self, src, **kwds): if self._reader.header is None: self.names = None else: - self.names = list(self._reader.header) + if len(self._reader.header) > 1: + # we have a multi index in the columns + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names) + else: + self.names = list(self._reader.header[0]) if self.names is None: if self.prefix: @@ -958,12 +1057,14 @@ def __init__(self, src, **kwds): if not self._has_complex_date_col: if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): + _is_index_col(self.index_col)): self._name_processed = True - (self.index_names, self.names, - self.index_col) = _clean_index_names(self.names, - self.index_col) + (index_names, self.names, + self.index_col) = _clean_index_names(self.names, self.index_col) + + if self.index_names is None: + self.index_names = index_names if self._reader.header is None and not passed_names: self.index_names = [None] * len(self.index_names) @@ -1049,7 +1150,10 @@ def read(self, nrows=None): data = dict((k, v) for k, (i, v) in zip(names, data)) names, data = self._do_date_conversions(names, data) - index = self._make_index(data, alldata, names) + index, names = self._make_index(data, alldata, names) + + # maybe create a mi on the columns + names = self._maybe_make_multi_index_columns(names, self.col_names) return index, names, data @@ -1061,7 +1165,7 @@ def _filter_usecols(self, names): return names def _get_index_names(self): - names = list(self._reader.header) + names = list(self._reader.header[0]) idx_names = None if self._reader.leading_cols == 0 and self.index_col is not None: @@ -1169,7 +1273,6 @@ def __init__(self, f, **kwds): raise Exception("usecols not supported with engine='python'" " or multicharacter separators (yet).") - self.header = kwds['header'] self.encoding = kwds['encoding'] self.compression = kwds['compression'] self.skiprows = kwds['skiprows'] @@ -1208,6 +1311,13 @@ def __init__(self, f, **kwds): self.data = f 
self.columns = self._infer_columns() + # we are processing a multi index column + if len(self.columns) > 1: + self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names) + else: + self.columns = self.columns[0] + # get popped off for index self.orig_names = list(self.columns) @@ -1215,9 +1325,11 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (self.index_names, + (index_names, self.orig_names, _) = self._get_index_name(self.columns) self._name_processed = True + if self.index_names is None: + self.index_names = index_names self._first_chunk = True def _make_reader(self, f): @@ -1321,10 +1433,7 @@ def read(self, rows=None): columns, data = self._do_date_conversions(self.columns, data) data = self._convert_data(data) - index = self._make_index(data, alldata, columns) - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - index.names = indexnamerow[:coffset] + index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data @@ -1350,36 +1459,58 @@ def _infer_columns(self): names = self.names if self.header is not None: - if len(self.buf) > 0: - line = self.buf[0] - else: - line = self._next_line() + header = self.header - while self.pos <= self.header: - line = self._next_line() + # we have a mi columns, so read and extra line + if isinstance(header,(list,tuple,np.ndarray)): + have_mi_columns = True + header = list(header) + [header[-1]+1] + else: + have_mi_columns = False + header = [ header ] columns = [] - for i, c in enumerate(line): - if c == '': - columns.append('Unnamed: %d' % i) + for level, hr in enumerate(header): + + if len(self.buf) > 0: + line = self.buf[0] else: - columns.append(c) + line = self._next_line() + + while self.pos <= hr: + line = self._next_line() - if self.mangle_dupe_cols: - counts = {} - for i, col in enumerate(columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - columns[i] = '%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 + this_columns = [] + for i, c in enumerate(line): + if c == '': + if have_mi_columns: + this_columns.append('Unnamed: %d_level_%d' % (i,level)) + else: + this_columns.append('Unnamed: %d' % i) + else: + this_columns.append(c) + + if not have_mi_columns: + if self.mangle_dupe_cols: + counts = {} + for i, col in enumerate(this_columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + this_columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + + columns.append(this_columns) self._clear_buffer() if names is not None: - if len(names) != len(columns): + if len(names) != len(columns[0]): raise Exception('Number of passed names did not match ' 'number of header fields in the file') - columns = names + if len(columns) > 1: + raise Exception('Cannot pass names with multi-index columns') + columns = [ names ] + else: if len(self.buf) > 0: line = self.buf[0] @@ -1389,11 +1520,11 @@ def _infer_columns(self): ncols = len(line) if not names: if self.prefix: - columns = ['X%d' % i for i in range(ncols)] + columns = [ ['X%d' % i for i in range(ncols)] ] else: - columns = range(ncols) + columns = [ range(ncols) ] else: - columns = names + columns = [ names ] return columns diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index b352b189a74b8..0c5b168ee8de5 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -179,7 +179,7 @@ def 
test_header_not_enough_lines(self): reader = TextReader(StringIO(data), delimiter=',', header=2, as_recarray=True) header = reader.header - expected = ['a', 'b', 'c'] + expected = [['a', 'b', 'c']] self.assertEquals(header, expected) recs = reader.read() diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 38a31c042d120..be47f28749848 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -20,6 +20,7 @@ TextFileReader, TextParser) from pandas.util.testing import (assert_almost_equal, assert_series_equal, + makeCustomDataframe as mkdf, network, ensure_clean) import pandas.util.testing as tm @@ -994,6 +995,49 @@ def test_header_not_first_line(self): expected = self.read_csv(StringIO(data2), header=0, index_col=0) tm.assert_frame_equal(df, expected) + def test_header_multi_index(self): + expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + + # basic test with both engines + for engine in ['c','python']: + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False, + engine=engine) + tm.assert_frame_equal(df, expected) + + # skipping lines in the header + df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + #### invalid options #### + + # no as_recarray + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], as_recarray=True, tupleize_cols=False) + + # names + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], names=['foo','bar'], tupleize_cols=False) + # usecols + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) + # non-numeric index_col + self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + index_col=['foo','bar'], tupleize_cols=False) + def test_pass_names_with_index(self): lines = self.data1.split('\n') no_header = '\n'.join(lines[1:]) diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 694a769641b0d..ee92e2e60960c 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h": char thousands int header # Boolean: 1: has header, 0: no header + int header_start # header row start + int header_end # header row end void *skipset int skip_footer @@ -230,7 +232,7 @@ cdef class TextReader: cdef: parser_t *parser object file_handle - bint factorize, na_filter, verbose, has_usecols + bint factorize, na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks char *c_encoding @@ -242,7 +244,7 @@ cdef class TextReader: object na_values, true_values, false_values object memory_map object as_recarray - object header, names + object header, orig_header, names, header_start, header_end object low_memory object skiprows object compact_ints, use_unsigned @@ -250,12 +252,15 @@ cdef class TextReader: object encoding object compression object mangle_dupe_cols + object tupleize_cols set noconvert, usecols def __cinit__(self, source, delimiter=b',', header=0, + header_start=0, + header_end=0, names=None, memory_map=False, @@ -300,12 +305,14 @@ cdef class TextReader: 
skiprows=None, skip_footer=0, verbose=False, - mangle_dupe_cols=True): + mangle_dupe_cols=True, + tupleize_cols=True): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols=mangle_dupe_cols + self.tupleize_cols=tupleize_cols # For timekeeping self.clocks = [] @@ -433,13 +440,34 @@ cdef class TextReader: self.leading_cols = 0 # TODO: no header vs. header is not the first row + self.has_mi_columns = 0 + self.orig_header = header if header is None: # sentinel value + self.parser.header_start = -1 + self.parser.header_end = -1 self.parser.header = -1 self.parser_start = 0 + self.header = [] else: - self.parser.header = header - self.parser_start = header + 1 + if isinstance(header, list) and len(header): + # need to artifically skip the final line + # which is still a header line + header = list(header) + header.append(header[-1]+1) + + self.parser.header_start = header[0] + self.parser.header_end = header[-1] + self.parser.header = header[0] + self.parser_start = header[-1] + 1 + self.has_mi_columns = 1 + self.header = header + else: + self.parser.header_start = header + self.parser.header_end = header + self.parser.header = header + self.parser_start = header + 1 + self.header = [ header ] self.names = names self.header, self.table_width = self._get_header() @@ -534,8 +562,10 @@ cdef class TextReader: ' got %s type' % type(source)) cdef _get_header(self): + # header is now a list of lists, so field_count should use header[0] + cdef: - size_t i, start, data_line, field_count, passed_count + size_t i, start, data_line, field_count, passed_count, hr char *word object name int status @@ -544,49 +574,59 @@ cdef class TextReader: header = [] - if self.parser.header >= 0: - # Header is in the file + if self.parser.header_start >= 0: - if self.parser.lines < self.parser.header + 1: - self._tokenize_rows(self.parser.header + 2) + # Header is in the file + for level, hr in enumerate(self.header): - # e.g., if header=3 and file only has 2 lines - if self.parser.lines < self.parser.header + 1: - raise CParserError('Passed header=%d but only %d lines in file' - % (self.parser.header, self.parser.lines)) + this_header = [] - field_count = self.parser.line_fields[self.parser.header] - start = self.parser.line_start[self.parser.header] + if self.parser.lines < hr + 1: + self._tokenize_rows(hr + 2) - # TODO: Py3 vs. Py2 - counts = {} - for i in range(field_count): - word = self.parser.words[start + i] + # e.g., if header=3 and file only has 2 lines + if self.parser.lines < hr + 1: + msg = self.orig_header + if isinstance(msg,list): + msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) + raise CParserError('Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) - if self.c_encoding == NULL and not PY3: - name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + field_count = self.parser.line_fields[hr] + start = self.parser.line_start[hr] - if name == '': - name = 'Unnamed: %d' % i + # TODO: Py3 vs. 
Py2 + counts = {} + for i in range(field_count): + word = self.parser.words[start + i] + if self.c_encoding == NULL and not PY3: + name = PyBytes_FromString(word) + else: + if self.c_encoding == NULL or self.c_encoding == b'utf-8': + name = PyUnicode_FromString(word) + else: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) + + if name == '': + if self.has_mi_columns: + name = 'Unnamed: %d_level_%d' % (i,level) + else: + name = 'Unnamed: %d' % i + + count = counts.get(name, 0) + if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: + this_header.append('%s.%d' % (name, count)) + else: + this_header.append(name) + counts[name] = count + 1 - count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols: - header.append('%s.%d' % (name, count)) - else: - header.append(name) - counts[name] = count + 1 - - data_line = self.parser.header + 1 + data_line = hr + 1 + header.append(this_header) if self.names is not None: - header = self.names + header = [ self.names ] elif self.names is not None: # Enforce this unless usecols @@ -597,11 +637,11 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - header = self.names + header = [ self.names ] data_line = 0 if self.parser.lines < 1: - field_count = len(header) + field_count = len(header[0]) else: field_count = self.parser.line_fields[data_line] else: @@ -613,7 +653,7 @@ cdef class TextReader: # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: - field_count = len(header) + field_count = len(header[0]) else: # not self.has_usecols: field_count = self.parser.line_fields[data_line] @@ -622,7 +662,7 @@ cdef class TextReader: if self.names is not None: field_count = max(field_count, len(self.names)) - passed_count = len(header) + passed_count = len(header[0]) # if passed_count > field_count: # raise CParserError('Column names have %d fields, ' @@ -1038,10 +1078,10 @@ cdef class TextReader: if self.header is not None: j = i - self.leading_cols # hack for #2442 - if j == len(self.header): + if j == len(self.header[0]): return j else: - return self.header[j] + return self.header[0][j] else: return None @@ -1762,6 +1802,9 @@ def _to_structured_array(dict columns, object names): if names is None: names = ['%d' % i for i in range(len(columns))] + else: + # single line header + names = names[0] dt = np.dtype([(str(name), columns[i].dtype) for i, name in enumerate(names)]) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 09cddd07e1c1d..81fda37acbb71 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -463,7 +463,7 @@ static int end_line(parser_t *self) { /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ - if (!(self->lines <= self->header + 1) + if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields)) { // increment file line count self->file_lines++; @@ -498,7 +498,7 @@ static int end_line(parser_t *self) { } else { /* missing trailing delimiters */ - if ((self->lines >= self->header + 1) && fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { /* Might overrun the buffer when closing fields */ if (make_stream_space(self, ex_fields - fields) < 0) { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 566e89ae5f9a7..5ba1b99a29d39 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -195,6 +195,8 @@ typedef struct parser_t { char thousands; 
int header; // Boolean: 1: has header, 0: no header + int header_start; // header row start + int header_end; // header row end void *skipset; int skip_footer; diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8e48ef094c419..fa2e8131b6916 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4755,9 +4755,15 @@ def test_to_csv_moar(self): def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None, dupe_col=False): - with ensure_clean(path) as path: - df.to_csv(path,encoding='utf8',chunksize=chunksize) - recons = DataFrame.from_csv(path,parse_dates=False) + if cnlvl: + header = range(cnlvl) + with ensure_clean(path) as path: + df.to_csv(path,encoding='utf8',chunksize=chunksize,tupleize_cols=False) + recons = DataFrame.from_csv(path,header=range(cnlvl),tupleize_cols=False,parse_dates=False) + else: + with ensure_clean(path) as path: + df.to_csv(path,encoding='utf8',chunksize=chunksize) + recons = DataFrame.from_csv(path,header=0,parse_dates=False) def _to_uni(x): if not isinstance(x,unicode): @@ -4773,16 +4779,6 @@ def _to_uni(x): recons.index = ix recons = recons.iloc[:,rnlvl-1:] - if cnlvl: - def stuple_to_tuple(x): - import re - x = x.split(",") - x = map(lambda x: re.sub("[\'\"\s\(\)]","",x),x) - return x - - cols=MultiIndex.from_tuples(map(stuple_to_tuple,recons.columns)) - recons.columns = cols - type_map = dict(i='i',f='f',s='O',u='O',dt='O',p='O') if r_dtype: if r_dtype == 'u': # unicode @@ -4827,7 +4823,6 @@ def stuple_to_tuple(x): assert_frame_equal(df, recons,check_names=False,check_less_precise=True) - N = 100 chunksize=1000 @@ -4962,6 +4957,7 @@ def test_to_csv_multiindex(self): frame.index = new_index with ensure_clean(pname) as path: + frame.to_csv(path, header=False) frame.to_csv(path, cols=['A', 'B']) @@ -4973,7 +4969,7 @@ def test_to_csv_multiindex(self): self.assertEqual(frame.index.names, df.index.names) self.frame.index = old_index # needed if setUP becomes a classmethod - # try multiindex with dates + # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] @@ -4994,6 +4990,102 @@ def test_to_csv_multiindex(self): assert_almost_equal(recons.values, self.tsframe.values) self.tsframe.index = old_index # needed if setUP becomes classmethod + with ensure_clean(pname) as path: + # GH3571, GH1651, GH3141 + + def _make_frame(names=None): + if names is True: + names = ['first','second'] + return DataFrame(np.random.randint(0,10,size=(3,3)), + columns=MultiIndex.from_tuples([('bah', 'foo'), + ('bah', 'bar'), + ('ban', 'baz')], + names=names)) + + # column & index are multi-index + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + assert_frame_equal(df,result) + + # column is mi + df = mkdf(5,3,r_idx_nlevels=1,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=0,tupleize_cols=False) + assert_frame_equal(df,result) + + # dup column names? 
+ df = mkdf(5,3,r_idx_nlevels=3,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + result.columns = ['R2','A','B','C'] + new_result = result.reset_index().set_index(['R0','R1','R2']) + new_result.columns = df.columns + assert_frame_equal(df,new_result) + + # writing with no index + df = _make_frame() + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + assert_frame_equal(df,result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + self.assert_(all([ x is None for x in result.columns.names ])) + result.columns.names = df.columns.names + assert_frame_equal(df,result) + + # tupleize_cols=True and index=False + df = _make_frame(True) + df.to_csv(path,tupleize_cols=True,index=False) + result = read_csv(path,header=0,tupleize_cols=True,index_col=None) + result.columns = df.columns + assert_frame_equal(df,result) + + # whatsnew example + df = _make_frame() + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + + # column & index are multi-index (compatibility) + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=True) + result = read_csv(path,header=0,index_col=[0,1],tupleize_cols=True) + result.columns = df.columns + assert_frame_equal(df,result) + + # invalid options + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + + # catch invalid headers + try: + read_csv(path,tupleize_cols=False,header=range(3),index_col=0) + except (Exception), detail: + if not str(detail).startswith('Passed header=[0,1,2] are too many rows for this multi_index of columns'): + raise AssertionError("failure in read_csv header=range(3)") + + try: + read_csv(path,tupleize_cols=False,header=range(7),index_col=0) + except (Exception), detail: + if not str(detail).startswith('Passed header=[0,1,2,3,4,5,6], len of 7, but only 6 lines in file'): + raise AssertionError("failure in read_csv header=range(7)") + + for i in [3,4,5,6,7]: + self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=range(i), index_col=0) + self.assertRaises(Exception, read_csv, path, tupleize_cols=False, header=[0,2], index_col=0) + + # write with cols + self.assertRaises(Exception, df.to_csv, path,tupleize_cols=False,cols=['foo','bar']) + with ensure_clean(pname) as path: # empty tsframe[:0].to_csv(path)
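Finally, the compatibility path exercised at the end of ``test_to_csv_multiindex`` above: with ``tupleize_cols=True`` (the 0.11.1 default) the column tuples are written as their string representation, so the round trip does not rebuild a ``MultiIndex`` and the columns have to be reassigned by hand, exactly as the test does::

    import pandas as pd
    from pandas.util.testing import makeCustomDataframe as mkdf

    df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
    df.to_csv('compat.csv', tupleize_cols=True)
    result = pd.read_csv('compat.csv', header=0, index_col=[0, 1],
                         tupleize_cols=True)
    result.columns = df.columns  # restore the MultiIndex manually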