From ca83d5ec90402af14ddd82754efe6e6c1abbc3d8 Mon Sep 17 00:00:00 2001 From: y-p Date: Fri, 15 Mar 2013 04:33:25 +0200 Subject: [PATCH 01/27] ENH: improve performance of df.to_csv GH3054 --- pandas/core/common.py | 20 ++++++++++++++++++++ pandas/core/frame.py | 44 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index a3e8c09839891..54b6564badd03 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1594,6 +1594,26 @@ def _check_as_is(x): # empty queue self.queue.truncate(0) + def writerows(self, rows): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + for i, row in enumerate(rows): + rows[i] = [x if _check_as_is(x) + else pprint_thing(x).encode('utf-8') for x in row] + + self.writer.writerows([[s for s in row] for row in rows]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + _NS_DTYPE = np.dtype('M8[ns]') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ee586a2101f62..976b0c7a013f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1345,15 +1345,33 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) + + spaces = [None] * len(cols) + if index: + if nlevels == 1: + row_fields_f = lambda x: [x] + spaces + else: # handle MultiIndex + row_fields_f = lambda x: list(x) + spaces + else: + nlevels = 0 + row_fields_f = lambda x: [None] * len(cols) + + # In crude testing, N>100 yields little marginal improvement + N=100 + rows = [None]*N + + all_cols = False + if len(cols) < 10000: + all_cols = list(enumerate(cols)) + + j = None for j, idx in enumerate(data_index): - row_fields = [] - if index: - if nlevels == 1: - row_fields = [idx] - else: # handle MultiIndex - row_fields = list(idx) - for i, col in enumerate(cols): + row_fields = row_fields_f(idx) + + for i, col in (all_cols or enumerate(cols)): val = series[col][j] + + if lib.checknull(val): val = na_rep @@ -1362,9 +1380,17 @@ def _helper_csv(self, writer, na_rep=None, cols=None, elif isinstance(val, np.datetime64): val = lib.Timestamp(val)._repr_base - row_fields.append(val) + row_fields[i+nlevels] = val + + rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val + ,row_fields) + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if j is not None and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) - writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From d46fa22ec1ad577163f459a943a58eee2e8b183f Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 16:17:16 -0400 Subject: [PATCH 02/27] ENH: to_csv using masking to simplify dtype processing --- pandas/core/frame.py | 34 ++++++++++++++++++++-------------- pandas/tests/test_frame.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 976b0c7a013f5..c017890d234f6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1297,8 +1297,25 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series = 
{} for k, v in self._series.iteritems(): - series[k] = v.values - + mask = isnull(v) + imask = -mask + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + + series[k] = values + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1369,18 +1386,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, row_fields = row_fields_f(idx) for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - - - if lib.checknull(val): - val = na_rep - - if float_format is not None and com.is_float(val): - val = float_format % val - elif isinstance(val, np.datetime64): - val = lib.Timestamp(val)._repr_base - - row_fields[i+nlevels] = val + row_fields[i+nlevels] = series[col][j] rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val ,row_fields) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1c30dfd1abced..5d270bb037c7a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4562,6 +4562,37 @@ def test_to_csv_withcommas(self): os.remove(path) + def test_to_csv_mixed(self): + filename = '__tmp_to_csv_mixed__.csv' + def create_cols(name): + return [ "%s%03d" % (name,i) for i in xrange(5) ] + + df_float = DataFrame(np.random.randn(100, 5),dtype='float64',columns=create_cols('float')) + df_int = DataFrame(np.random.randn(100, 5),dtype='int64',columns=create_cols('int')) + df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) + df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) + + # add in some nans + df_float.ix[30:50,1:3] = np.nan + + #### this is a bug in read_csv right now #### + #df_dt.ix[30:50,1:3] = np.nan + + df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + # dtype + dtypes = dict() + for n,dtype in [('float',np.float64),('int',np.int64),('bool',np.bool),('object',np.object)]: + for c in create_cols(n): + dtypes[c] = dtype + + df.to_csv(filename) + + rs = pan.read_csv(filename, index_col=0, dtype=dtypes, parse_dates=create_cols('date')) + assert_frame_equal(rs, df) + os.remove(filename) + def test_to_csv_bug(self): path = '__tmp_to_csv_bug__.csv' f1 = StringIO('a,1.0\nb,2.0') From 7c6777675a27e345a99ea93603da695a8cd0df9c Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 02:06:27 +0200 Subject: [PATCH 03/27] ENH: more perf tweaks in df.to_csv --- pandas/core/frame.py | 56 +++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c017890d234f6..d8d2f07ac0ee3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1362,37 +1362,49 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - - spaces = [None] * len(cols) - if index: - if nlevels == 1: - row_fields_f = lambda x: [x] + spaces - 
else: # handle MultiIndex - row_fields_f = lambda x: list(x) + spaces - else: + if not index: nlevels = 0 - row_fields_f = lambda x: [None] * len(cols) # In crude testing, N>100 yields little marginal improvement N=100 - rows = [None]*N + + # pre-allocate rows + rows = [[None]*(nlevels+len(cols)) for x in range(N)] all_cols = False - if len(cols) < 10000: + if len(cols) < 10000: # 10000 as in "usually" all_cols = list(enumerate(cols)) j = None - for j, idx in enumerate(data_index): - row_fields = row_fields_f(idx) - - for i, col in (all_cols or enumerate(cols)): - row_fields[i+nlevels] = series[col][j] - - rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val - ,row_fields) - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) + if nlevels == 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[0] = idx + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + elif nlevels > 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[:nlevels] = list(idx) + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + else: + for j, idx in enumerate(data_index): + row = rows[j % N] + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) if j is not None and (j < N-1 or (j % N) != N-1 ): writer.writerows(rows[:((j+1) % N)]) From 93496813ccfa6b635b72a307d00f296412e6eda6 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 19:30:36 -0400 Subject: [PATCH 04/27] PERF: cythonized parts of to_csv for increased perf --- pandas/core/frame.py | 48 +------------------------------------------- pandas/lib.pyx | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8d2f07ac0ee3..896880995feb6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1362,53 +1362,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - if not index: - nlevels = 0 - - # In crude testing, N>100 yields little marginal improvement - N=100 - - # pre-allocate rows - rows = [[None]*(nlevels+len(cols)) for x in range(N)] - - all_cols = False - if len(cols) < 10000: # 10000 as in "usually" - all_cols = list(enumerate(cols)) - - j = None - if nlevels == 1: - for j, idx in enumerate(data_index): - row = rows[j % N] - row[0] = idx - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - elif nlevels > 1: - for j, idx in enumerate(data_index): - row = rows[j % N] - row[:nlevels] = list(idx) - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - else: - for j, idx in enumerate(data_index): - row = rows[j % N] - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val 
- - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - - if j is not None and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) - + lib.write_csv_rows(series, list(data_index), index, nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 1fd579553f094..051c4e74a60b9 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -784,6 +784,51 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje return arr +@cython.boundscheck(False) +@cython.wraparound(False) +def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): + + cdef int N, j, i, ncols, ndata_index + cdef list rows, row_fields, spaces + cdef object v + + ncols = len(cols) + spaces = [None] * len(cols) + if index: + if nlevels == 1: + row_fields_f = lambda x: [x] + spaces + else: # handle MultiIndex + row_fields_f = lambda x: list(x) + spaces + else: + nlevels = 0 + row_fields_f = lambda x: [None] * len(cols) + + # In crude testing, N>100 yields little marginal improvement + N=100 + rows = [None]*N + + ndata_index = len(data_index) + for j in range(ndata_index): + row_fields = row_fields_f(data_index[j]) + + for i in range(len(row_fields)): + v = row_fields[i] + if isinstance(v,np.number): + row_fields[i] = np.asscalar(v) + for i in range(ncols): + v = series[cols[i]][j] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[i+nlevels] = v + + rows[ j % N ] = row_fields + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if ndata_index and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) + @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_2d(ndarray indexer0, From 10857b089ec66c7931a9336cf945ba397863f53c Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 20:15:11 -0400 Subject: [PATCH 05/27] PERF: more cython tweaks --- pandas/lib.pyx | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 051c4e74a60b9..4bb6e2df81642 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -788,40 +788,48 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.wraparound(False) def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): - cdef int N, j, i, ncols, ndata_index - cdef list rows, row_fields, spaces - cdef object v + cdef int N, j, i, l, ncols, ndata_index + cdef list rows, spaces + cdef object v, val + cdef ndarray row_fields ncols = len(cols) - spaces = [None] * len(cols) - if index: - if nlevels == 1: - row_fields_f = lambda x: [x] + spaces - else: # handle MultiIndex - row_fields_f = lambda x: list(x) + spaces - else: - nlevels = 0 - row_fields_f = lambda x: [None] * len(cols) # In crude testing, N>100 yields little marginal improvement N=100 rows = [None]*N ndata_index = len(data_index) + + if index: + row_fields = np.empty(ncols+nlevels,dtype=object) + else: + nlevels = 0 + row_fields = np.empty(ncols,dtype=object) + for j in range(ndata_index): - row_fields = row_fields_f(data_index[j]) - for i in range(len(row_fields)): - v = row_fields[i] - if isinstance(v,np.number): - row_fields[i] = np.asscalar(v) + if index: + if nlevels == 1: + v = data_index[j] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[0] = v + else: + val = data_index[j] + 
for l in range(nlevels): + v = val[l] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[l] = v + for i in range(ncols): v = series[cols[i]][j] if isinstance(v,np.number): v = np.asscalar(v) row_fields[i+nlevels] = v - rows[ j % N ] = row_fields + rows[ j % N ] = row_fields.copy() if j >= N-1 and j % N == N-1: writer.writerows(rows) From 6d4e0bb1e959bc9f09bfc493dedbc4dff9528643 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 03:28:46 +0200 Subject: [PATCH 06/27] PERF: cythonize improved python version --- pandas/core/frame.py | 7 +++- pandas/lib.pyx | 88 ++++++++++++++++++++++---------------------- 2 files changed, 49 insertions(+), 46 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 896880995feb6..1c69efe487dc1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1315,7 +1315,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, values[imask] = np.array([ float_format % val for val in v[imask] ]) series[k] = values - + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1362,7 +1362,10 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - lib.write_csv_rows(series, list(data_index), index, nlevels, list(cols), writer) + if not index: + nlevels = 0 + + lib.write_csv_rows(series, list(data_index), nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 4bb6e2df81642..3ecf08df29df6 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -786,56 +786,56 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): - - cdef int N, j, i, l, ncols, ndata_index - cdef list rows, spaces - cdef object v, val - cdef ndarray row_fields +def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): - ncols = len(cols) + cdef int N, j, i + cdef list rows, all_cols + cdef object val # In crude testing, N>100 yields little marginal improvement N=100 - rows = [None]*N - - ndata_index = len(data_index) - if index: - row_fields = np.empty(ncols+nlevels,dtype=object) + # pre-allocate rows + rows = [[None]*(nlevels+len(cols)) for x in range(N)] + + all_cols = [] + if len(cols) < 10000: # 10000 as in "usually" + all_cols = list(enumerate(cols)) + + j = -1 + if nlevels == 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[0] = idx + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + elif nlevels > 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[:nlevels] = list(idx) + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) else: - nlevels = 0 - row_fields = np.empty(ncols,dtype=object) - - for j in range(ndata_index): - - if index: - if nlevels == 1: - v = data_index[j] - if isinstance(v,np.number): - v = np.asscalar(v) - row_fields[0] = v - else: - val = data_index[j] - for l in range(nlevels): - v = val[l] - if isinstance(v,np.number): - v 
= np.asscalar(v) - row_fields[l] = v - - for i in range(ncols): - v = series[cols[i]][j] - if isinstance(v,np.number): - v = np.asscalar(v) - row_fields[i+nlevels] = v - - rows[ j % N ] = row_fields.copy() - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - - if ndata_index and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) + for j, idx in enumerate(data_index): + row = rows[j % N] + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if j >= 0 and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) + @cython.boundscheck(False) @cython.wraparound(False) From 7ac83ebe5e0e802b490767eb377fc1fab44a21b1 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 19:30:36 -0400 Subject: [PATCH 07/27] PERF: cythonized parts of to_csv for increased perf --- pandas/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 3ecf08df29df6..5eaa7375d23ab 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -786,6 +786,7 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) + def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): cdef int N, j, i From d78f4f6a6bb03b0f2ac658cac630456c6971a755 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 23:15:40 -0400 Subject: [PATCH 08/27] PERF: more speedups --- pandas/core/frame.py | 2 +- pandas/lib.pyx | 36 +++++++++++++++--------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1c69efe487dc1..5a97b25422e7c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1314,7 +1314,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if float_format: values[imask] = np.array([ float_format % val for val in v[imask] ]) - series[k] = values + series[k] = values.tolist() has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5eaa7375d23ab..2bc4eccdb4275 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -789,47 +789,41 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): - cdef int N, j, i - cdef list rows, all_cols + cdef int N, j, i, ncols + cdef list rows cdef object val # In crude testing, N>100 yields little marginal improvement N=100 # pre-allocate rows - rows = [[None]*(nlevels+len(cols)) for x in range(N)] - - all_cols = [] - if len(cols) < 10000: # 10000 as in "usually" - all_cols = list(enumerate(cols)) + ncols = len(cols) + rows = [[None]*(nlevels+ncols) for x in range(N)] j = -1 if nlevels == 1: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - row[0] = idx - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + row[0] = data_index[j] + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) elif nlevels > 1: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - row[:nlevels] = list(idx) - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = 
np.asscalar(val) if isinstance(val,np.number) else val + row[:nlevels] = list(data_index[j]) + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) else: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 55adfb7bfd16a85b33bbf9a87896057e688f4cbd Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 23:52:14 -0400 Subject: [PATCH 09/27] ENH: add chunksize parameter to DataFrame.to_csv to enable constant memory usage by writing in chunks --- RELEASE.rst | 2 ++ pandas/core/frame.py | 70 ++++++++++++++++++++++++-------------- pandas/tests/test_frame.py | 16 +++++++++ 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 2eb7980458f8e..9cd2a620e6fce 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -62,6 +62,8 @@ pandas 0.11.0 strings that can be parsed with datetime.strptime - Add ``axes`` property to ``Series`` for compatibility - Add ``xs`` function to ``Series`` for compatibility + - Add ``chunksize`` parameter to ``to_csv`` to allow writing in chunks + to enable constant memory usage **API Changes** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a97b25422e7c..6a8de3402228a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1291,31 +1291,11 @@ def to_panel(self): def _helper_csv(self, writer, na_rep=None, cols=None, header=True, index=True, - index_label=None, float_format=None): + index_label=None, float_format=None, + chunksize=None): if cols is None: cols = self.columns - series = {} - for k, v in self._series.iteritems(): - mask = isnull(v) - imask = -mask - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - values = np.array(v.values,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) - - series[k] = values.tolist() - has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1365,12 +1345,50 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if not index: nlevels = 0 - lib.write_csv_rows(series, list(data_index), nlevels, list(cols), writer) + rows = len(data_index) + + # write in chunksize bites + if chunksize is None: + chunksize = 100000 + chunks = int(rows / chunksize)+1 + + for i in xrange(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, rows) + if start_i == end_i: + continue + + # create the data for a chunk + chunk = self.iloc[start_i:end_i] + + series = {} + for k, v in chunk.iteritems(): + mask = isnull(v) + imask = -mask + + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ 
lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + + series[k] = values.tolist() + + lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n'): + line_terminator='\n', chunksize=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1407,6 +1425,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, file quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL + chunksize : rows to write at a time """ if nanRep is not None: # pragma: no cover import warnings @@ -1435,7 +1454,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, self._helper_csv(csvout, na_rep=na_rep, float_format=float_format, cols=cols, header=header, index=index, - index_label=index_label) + index_label=index_label, + chunksize=chunksize) finally: if close: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5d270bb037c7a..286fb5906f63d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,6 +4593,22 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) + def test_to_csv_chunking(self): + filename = '__tmp_to_csv_chunking__.csv' + + aa=DataFrame({'A':range(100000)}) + + aa['B'] = aa.A + 1.0 + aa['C'] = aa.A + 2.0 + aa['D'] = aa.A + 3.0 + + for chunksize in [10000,50000,100000]: + aa.to_csv(filename,chunksize=chunksize) + rs = pan.read_csv(filename,index_col=0) + assert_frame_equal(rs, aa) + + os.remove(filename) + def test_to_csv_bug(self): path = '__tmp_to_csv_bug__.csv' f1 = StringIO('a,1.0\nb,2.0') From dcc45a73b52f03c2bbe1d0a0945ab86b858426d8 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:04:56 +0200 Subject: [PATCH 10/27] CLN: move repeated cast out of loop --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6a8de3402228a..a52609821ec2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1352,6 +1352,8 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = 100000 chunks = int(rows / chunksize)+1 + cols = list(cols) + for i in xrange(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, rows) @@ -1383,7 +1385,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series[k] = values.tolist() - lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, list(cols), writer) + lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From 5a7c28d80dc37531a9ee9e474472d5349c174f63 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:05:40 +0200 Subject: [PATCH 11/27] CLN: make guard more defensive --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a52609821ec2b..7c65b105cb054 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1345,20 +1345,20 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if not index: nlevels = 
0 - rows = len(data_index) + nrows = len(data_index) # write in chunksize bites if chunksize is None: chunksize = 100000 - chunks = int(rows / chunksize)+1 + chunks = int(nrows / chunksize)+1 cols = list(cols) for i in xrange(chunks): start_i = i * chunksize - end_i = min((i + 1) * chunksize, rows) - if start_i == end_i: - continue + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break # create the data for a chunk chunk = self.iloc[start_i:end_i] From ba620668ef46c245dbd1e557ab7faa2d8eef3edc Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 10:52:18 +0200 Subject: [PATCH 12/27] REF: add com._ndarray_to_native_types --- pandas/core/common.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index 54b6564badd03..6babf24530f6f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -101,6 +101,27 @@ def _isnull_old(obj): _isnull = _isnull_new +# float format is a bit of out of place here, +# but we'd like to reuse the mask. +def _ndarray_to_native_types(v,na_rep='',float_format=None): + mask = isnull(v) + imask = -mask + + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + return values.tolist() def _use_inf_as_null(key): '''Option change callback for null/inf behaviour From 6c6f6cfee99bef6174702ffccef9dd0c91a83978 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:54:14 +0200 Subject: [PATCH 13/27] REF: apply native type conv to ix, cols before write_csv --- pandas/core/frame.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7c65b105cb054..6de1e3012bef1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -24,7 +24,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, - _infer_dtype_from_scalar) + _infer_dtype_from_scalar, _ndarray_to_native_types) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -1352,7 +1352,10 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = 100000 chunks = int(nrows / chunksize)+1 - cols = list(cols) + if isinstance(cols,np.ndarray): + cols = _ndarray_to_native_types(cols,na_rep,float_format) + else: + cols=list(cols) for i in xrange(chunks): start_i = i * chunksize @@ -1365,27 +1368,12 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series = {} for k, v in chunk.iteritems(): - mask = isnull(v) - imask = -mask - - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - values = 
np.array(v.values,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) + series[k] = _ndarray_to_native_types(v,na_rep,float_format) - series[k] = values.tolist() + ix = _ndarray_to_native_types(data_index[start_i:end_i], + na_rep,float_format) - lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, cols, writer) + lib.write_csv_rows(series, ix, nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From 51793db20ebd0f87141ac72d1f07e0e2a9236656 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 16:16:55 -0400 Subject: [PATCH 14/27] PERF: added frame_to_csv2 vbench, revised frame_to_csv_mixed --- vb_suite/io_bench.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index ba386bd0e9649..dc335a4f994d5 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -44,17 +44,34 @@ """ frame_to_csv = Benchmark("df.to_csv('__test__.csv')", setup, start_date=datetime(2011, 1, 1)) +#---------------------------------- + +setup = common_setup + """ +df=DataFrame({'A':range(100000)}) +df['B'] = df.A + 1.0 +df['C'] = df.A + 2.0 +df['D'] = df.A + 3.0 +""" +frame_to_csv2 = Benchmark("df.to_csv('__test__.csv')", setup, + start_date=datetime(2011, 1, 1)) #---------------------------------- setup = common_setup + """ from pandas import concat, Timestamp -df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') -df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') -df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns) -df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns) -df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) +def create_cols(name): + return [ "%s%03d" % (name,i) for i in xrange(5) ] +df_float = DataFrame(np.random.randn(10000, 5),dtype='float64',columns=create_cols('float')) +df_int = DataFrame(np.random.randn(10000, 5),dtype='int64',columns=create_cols('int')) +df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) +df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) +df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) + +# add in some nans +df_float.ix[30:500,1:3] = np.nan + df = concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + """ frame_to_csv_mixed = Benchmark("df.to_csv('__test__.csv')", setup, start_date=datetime(2012, 6, 1)) From bb7d1da0d42b575815a4cb1b1d1cbb1055c481bb Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 06:06:03 +0200 Subject: [PATCH 15/27] TST: test for to_csv on failing vbench duplicate column names across dtypes is a problem, and not-easy to fix, so letting test fail . 
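The shape of the failure, as a rough sketch (simplified, not the
exact code path; the names mirror the locals in _helper_csv and the
data is made up): converted values are staged per column in a dict
keyed by column name, so duplicate names collapse to a single entry
and every duplicate ends up writing the same (last-seen) column's
values:

    series = {}
    for k, v in [('a', [1, 2]), ('b', [3, 4]), ('a', [5, 6])]:
        series[k] = v       # the second 'a' silently replaces the first
    print(len(series))      # 2 entries staged for 3 columns
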
--- pandas/tests/test_frame.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 286fb5906f63d..9452cc8b8c946 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,6 +4593,19 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) + def test_to_csv_mixed_dups_cols(self): + filename = '__tmp_to_csv_mixed_dup_cols__.csv' + + df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') + df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') + df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns) + df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) + df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + #### this raises because we have duplicate column names across dtypes #### + self.assertRaises(Exception, df.to_csv, filename) + def test_to_csv_chunking(self): filename = '__tmp_to_csv_chunking__.csv' From 71cb70d28bbfe9cde99404c5ff260be6013de423 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 06:07:43 +0200 Subject: [PATCH 16/27] ENH: refactor series from dict to list, eliminate one level of indirection --- pandas/core/frame.py | 8 +++----- pandas/lib.pyx | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6de1e3012bef1..20a366a3f3662 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1366,14 +1366,12 @@ def _helper_csv(self, writer, na_rep=None, cols=None, # create the data for a chunk chunk = self.iloc[start_i:end_i] - series = {} - for k, v in chunk.iteritems(): - series[k] = _ndarray_to_native_types(v,na_rep,float_format) - + data = [ _ndarray_to_native_types(v,na_rep,float_format + ) for k, v in chunk.iteritems() ] ix = _ndarray_to_native_types(data_index[start_i:end_i], na_rep,float_format) - lib.write_csv_rows(series, ix, nlevels, cols, writer) + lib.write_csv_rows(data, ix, nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 2bc4eccdb4275..850cd7fb97b2d 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -787,7 +787,7 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): +def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): cdef int N, j, i, ncols cdef list rows @@ -806,7 +806,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[nlevels+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) @@ -815,7 +815,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object row = rows[j % N] row[:nlevels] = list(data_index[j]) for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[nlevels+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) @@ -823,7 +823,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object for j in range(len(data_index)): row = rows[j % N] for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[i] = 
data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 7dc5f923e1f10c94716f1040a2294d4811873d27 Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 17 Mar 2013 06:08:54 +0200 Subject: [PATCH 17/27] ENH: replace variable lookup by constant . this is getting silly. --- pandas/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 850cd7fb97b2d..e12b524dda736 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -806,7 +806,7 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[nlevels+i] = data[i][j] + row[1+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 66e38f9951923fdd36a9f7a7def85d0b1226c566 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 11:58:20 +0200 Subject: [PATCH 18/27] ENH: make chunks process constant element count --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20a366a3f3662..946f98ae26cf4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1349,7 +1349,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, # write in chunksize bites if chunksize is None: - chunksize = 100000 + chunksize = (100000/ (len(cols) or 1)) or 1 chunks = int(nrows / chunksize)+1 if isinstance(cols,np.ndarray): From 20d32471baa4c191fb02deb0bb54cd0e5d10edfe Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 17 Mar 2013 12:01:51 +0200 Subject: [PATCH 19/27] PERF: avoid iteritems->iloc panelty for data conversion, use blocks --- pandas/core/common.py | 5 +++-- pandas/core/frame.py | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 6babf24530f6f..aff9001f2797c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -110,13 +110,14 @@ def _ndarray_to_native_types(v,na_rep='',float_format=None): if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': values = np.empty(len(v),dtype=object) values[mask] = 'NaT' - if v.dtype == 'datetime64[ns]': values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) elif v.dtype == 'timedelta64[ns]': values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) else: - values = np.array(v.values,dtype=object) + if hasattr(v,"values"): + v= v.values + values = np.array(v,dtype=object) values[mask] = na_rep if issubclass(v.dtype.type,np.floating): if float_format: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 946f98ae26cf4..23d1ad08cf9fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1296,6 +1296,11 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if cols is None: cols = self.columns + if isinstance(cols,np.ndarray): + cols = _ndarray_to_native_types(cols,na_rep,float_format) + else: + cols=list(cols) + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1352,11 +1357,6 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = (100000/ (len(cols) or 1)) or 1 chunks = int(nrows / chunksize)+1 - if isinstance(cols,np.ndarray): - cols = _ndarray_to_native_types(cols,na_rep,float_format) - else: - cols=list(cols) - for i in xrange(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, nrows) @@ -1364,10 +1364,22 @@ def _helper_csv(self, writer, na_rep=None, cols=None, break # create the data for a chunk - 
chunk = self.iloc[start_i:end_i] - data = [ _ndarray_to_native_types(v,na_rep,float_format - ) for k, v in chunk.iteritems() ] + blocks = self._data.blocks + data =[None] * sum(len(b.items) for b in blocks) + for i in range(len(blocks)): + b = blocks[i] + v = b.values + colname_map = dict((k,i) for i,k in enumerate(self.columns)) + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + d = blocks[i].values[:,start_i:end_i] + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + else: + d = _ndarray_to_native_types(b.values[:,start_i:end_i], na_rep,float_format) + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + ix = _ndarray_to_native_types(data_index[start_i:end_i], na_rep,float_format) From 67ca8ae4fe47301df85cbee9eaf8e1a6a72cefa9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 23:49:08 -0400 Subject: [PATCH 20/27] TST: test for to_csv on failing vbench duplicate column names across dtypes is a problem, and not-easy to fix, so letting test fail --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 23d1ad08cf9fa..e2c053f53dfa4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1364,7 +1364,6 @@ def _helper_csv(self, writer, na_rep=None, cols=None, break # create the data for a chunk - blocks = self._data.blocks data =[None] * sum(len(b.items) for b in blocks) for i in range(len(blocks)): From 099520871736ad741904d9f65c255831e526e9db Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 18:47:20 +0200 Subject: [PATCH 21/27] CLN: csv refactor --- pandas/core/common.py | 23 ------ pandas/core/format.py | 161 ++++++++++++++++++++++++++++++++++++++- pandas/core/frame.py | 134 +++----------------------------- pandas/core/index.py | 44 ++++++++--- pandas/core/internals.py | 47 +++++++++++- 5 files changed, 250 insertions(+), 159 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index aff9001f2797c..207ed2edac4bc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -101,29 +101,6 @@ def _isnull_old(obj): _isnull = _isnull_new -# float format is a bit of out of place here, -# but we'd like to reuse the mask. -def _ndarray_to_native_types(v,na_rep='',float_format=None): - mask = isnull(v) - imask = -mask - - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - if hasattr(v,"values"): - v= v.values - values = np.array(v,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) - return values.tolist() - def _use_inf_as_null(key): '''Option change callback for null/inf behaviour Choose which replacement for numpy.isnan / -numpy.isfinite is used. 
diff --git a/pandas/core/format.py b/pandas/core/format.py index 003b1fefd01f7..644c08b6b0e54 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -9,7 +9,7 @@ from io import StringIO from pandas.core.common import adjoin, isnull, notnull -from pandas.core.index import MultiIndex, _ensure_index +from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.util import py3compat from pandas.core.config import get_option, set_option, reset_option import pandas.core.common as com @@ -18,6 +18,7 @@ import numpy as np import itertools +import csv from pandas.tseries.period import PeriodIndex @@ -763,6 +764,164 @@ def grouper(x): return result +class CSVFormatter(object): + + def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, + cols=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None, quoting=None, + line_terminator='\n', chunksize=None): + + self.obj = obj + self.path_or_buf = path_or_buf + self.sep = sep + self.na_rep = na_rep + self.float_format = float_format + + self.header = header + self.index = index + self.index_label = index_label + self.mode = mode + self.encoding = encoding + + if quoting is None: + quoting = csv.QUOTE_MINIMAL + self.quoting = quoting + + self.line_terminator = line_terminator + + if cols is None: + cols = obj.columns + + if isinstance(cols,Index): + cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) + else: + cols=list(cols) + self.cols = cols + self.colname_map = dict((k,i) for i,k in enumerate(obj.columns)) + + if chunksize is None: + chunksize = (100000/ (len(self.cols) or 1)) or 1 + self.chunksize = chunksize + + self.data_index = obj.index + if isinstance(obj.index, PeriodIndex): + self.data_index = obj.index.to_timestamp() + + self.nlevels = getattr(self.data_index, 'nlevels', 1) + if not index: + self.nlevels = 0 + + def save(self): + + # create the writer & save + if hasattr(self.path_or_buf, 'read'): + f = self.path_or_buf + close = False + else: + f = com._get_handle(self.path_or_buf, self.mode, encoding=self.encoding) + close = True + + try: + if self.encoding is not None: + self.writer = com.UnicodeWriter(f, lineterminator=self.line_terminator, + delimiter=self.sep, encoding=self.encoding, + quoting=self.quoting) + else: + self.writer = csv.writer(f, lineterminator=self.line_terminator, + delimiter=self.sep, quoting=self.quoting) + + self._save() + + finally: + if close: + f.close() + + def _save_header(self): + + writer = self.writer + obj = self.obj + index_label = self.index_label + cols = self.cols + header = self.header + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or self.header: + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + 
writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + def _save(self): + + self._save_header() + + nrows = len(self.data_index) + + # write in chunksize bites + chunksize = self.chunksize + chunks = int(nrows / chunksize)+1 + + for i in xrange(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self._save_chunk(start_i, end_i) + + def _save_chunk(self, start_i, end_i): + + colname_map = self.colname_map + data_index = self.data_index + + # create the data for a chunk + blocks = self.obj._data.blocks + data =[None] * sum(len(b.items) for b in blocks) + slicer = slice(start_i,end_i) + for i in range(len(blocks)): + b = blocks[i] + d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + + ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + + lib.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) + # from collections import namedtuple # ExcelCell = namedtuple("ExcelCell", # 'row, col, val, style, mergestart, mergeend') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e2c053f53dfa4..b2dc6715b0638 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,7 +14,6 @@ from itertools import izip from StringIO import StringIO -import csv import operator import sys @@ -24,7 +23,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, - _infer_dtype_from_scalar, _ndarray_to_native_types) + _infer_dtype_from_scalar) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -1289,101 +1288,6 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csv(self, writer, na_rep=None, cols=None, - header=True, index=True, - index_label=None, float_format=None, - chunksize=None): - if cols is None: - cols = self.columns - - if isinstance(cols,np.ndarray): - cols = _ndarray_to_native_types(cols,na_rep,float_format) - else: - cols=list(cols) - - has_aliases = isinstance(header, (tuple, list, np.ndarray)) - if has_aliases or header: - if index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(self.index, MultiIndex): - index_label = [] - for i, name in enumerate(self.index.names): - if name is None: - name = '' - index_label.append(name) - else: - index_label = self.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, (list, tuple, np.ndarray)): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(header)))) - else: - write_cols = header - else: - write_cols = cols - encoded_cols = list(write_cols) - - writer.writerow(encoded_labels + encoded_cols) - else: - encoded_cols = list(cols) - writer.writerow(encoded_cols) - - data_index = self.index - if isinstance(self.index, PeriodIndex): - data_index = self.index.to_timestamp() - - nlevels = getattr(data_index, 'nlevels', 1) - if not index: - nlevels = 0 - - nrows = len(data_index) - - # write in chunksize bites - 
if chunksize is None: - chunksize = (100000/ (len(cols) or 1)) or 1 - chunks = int(nrows / chunksize)+1 - - for i in xrange(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) - if start_i >= end_i: - break - - # create the data for a chunk - blocks = self._data.blocks - data =[None] * sum(len(b.items) for b in blocks) - for i in range(len(blocks)): - b = blocks[i] - v = b.values - colname_map = dict((k,i) for i,k in enumerate(self.columns)) - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - d = blocks[i].values[:,start_i:end_i] - for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] - else: - d = _ndarray_to_native_types(b.values[:,start_i:end_i], na_rep,float_format) - for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] - - ix = _ndarray_to_native_types(data_index[start_i:end_i], - na_rep,float_format) - - lib.write_csv_rows(data, ix, nlevels, cols, writer) - def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, @@ -1432,33 +1336,15 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, FutureWarning) na_rep = nanRep - if hasattr(path_or_buf, 'read'): - f = path_or_buf - close = False - else: - f = com._get_handle(path_or_buf, mode, encoding=encoding) - close = True - - if quoting is None: - quoting = csv.QUOTE_MINIMAL - - try: - if encoding is not None: - csvout = com.UnicodeWriter(f, lineterminator=line_terminator, - delimiter=sep, encoding=encoding, - quoting=quoting) - else: - csvout = csv.writer(f, lineterminator=line_terminator, - delimiter=sep, quoting=quoting) - self._helper_csv(csvout, na_rep=na_rep, - float_format=float_format, cols=cols, - header=header, index=index, - index_label=index_label, - chunksize=chunksize) - - finally: - if close: - f.close() + formatter = fmt.CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, + sep=sep, encoding=encoding, + quoting=quoting,na_rep=na_rep, + float_format=float_format, cols=cols, + header=header, index=index, + index_label=index_label, + chunksize=chunksize) + formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, diff --git a/pandas/core/index.py b/pandas/core/index.py index 0f9776e202c00..95e6c40a9dad8 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -441,16 +441,7 @@ def format(self, name=False, formatter=None, na_rep='NaN'): return header + list(self.map(formatter)) if self.is_all_dates: - zero_time = time(0, 0) - result = [] - for dt in self: - if isnull(dt): - result.append(u'NaT') - else: - if dt.time() != zero_time or dt.tzinfo is not None: - return header + [u'%s' % x for x in self] - result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) - return header + result + return header + _date_formatter(self) values = self.values @@ -472,6 +463,20 @@ def format(self, name=False, formatter=None, na_rep='NaN'): result = _trim_front(format_array(values, None, justify='left')) return header + result + def to_native_types(self, slicer=None, na_rep='', float_format=None): + values = self + if slicer is not None: + values = values[slicer] + mask = isnull(values) + values = np.array(values,dtype=object) + + if self.is_all_dates: + return _date_formatter(self) + else: + values[mask] = na_rep + + return values.tolist() + def equals(self, other): """ Determines if two Index objects contain the same elements. 
@@ -1481,6 +1486,9 @@ def __repr__(self): def __len__(self): return len(self.labels[0]) + def to_native_types(self, slicer=None, na_rep='', float_format=None): + return self.tolist() + @property def _constructor(self): return MultiIndex.from_tuples @@ -2578,6 +2586,22 @@ def _wrap_joined_index(self, joined, other): # For utility purposes +def _date_formatter(obj, na_rep=u'NaT'): + data = list(obj) + + # tz formatter or time formatter + zero_time = time(0, 0) + for d in data: + if d.time() != zero_time or d.tzinfo is not None: + return [u'%s' % x for x in data ] + + values = np.array(data,dtype=object) + mask = isnull(obj.values) + values[mask] = na_rep + + imask = -mask + values[imask] = np.array([ u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day) for dt in values[imask] ]) + return values.tolist() def _sparsify(label_list, start=0): pivoted = zip(*label_list) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2a41bbffa3b83..3467b72541481 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4,13 +4,14 @@ from numpy import nan import numpy as np -from pandas.core.common import _possibly_downcast_to_dtype +from pandas.core.common import isnull, _possibly_downcast_to_dtype from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib +from pandas.tslib import Timestamp from pandas.util import py3compat @@ -259,6 +260,17 @@ def _try_cast_result(self, result): we may have roundtripped thru object in the mean-time """ return result + def to_native_types(self, slicer=None, na_rep='', **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + values = np.array(values,dtype=object) + mask = isnull(values) + values[mask] = na_rep + return values.tolist() + def replace(self, to_replace, value, inplace=False): new_values = self.values if inplace else self.values.copy() if self._can_hold_element(value): @@ -577,6 +589,20 @@ def _try_cast(self, element): except: # pragma: no cover return element + def to_native_types(self, slicer=None, na_rep='', float_format=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + values = np.array(values,dtype=object) + mask = isnull(values) + values[mask] = na_rep + if float_format: + imask = (-mask).ravel() + values.flat[imask] = np.array([ float_format % val for val in values.ravel()[imask] ]) + return values.tolist() + def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily @@ -701,6 +727,25 @@ def _try_cast(self, element): except: return element + def to_native_types(self, slicer=None, na_rep=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + mask = isnull(values) + + rvalues = np.empty(self.shape,dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (-mask).ravel() + if self.dtype == 'datetime64[ns]': + rvalues.flat[imask] = np.array([ Timestamp(val)._repr_base for val in values.ravel()[imask] ],dtype=object) + elif self.dtype == 'timedelta64[ns]': + rvalues.flat[imask] = np.array([ lib.repr_timedelta64(val) for val in values.ravel()[imask] ],dtype=object) + 
return rvalues.tolist() + def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) From 77761288ad4d04f40826252b8556e2f360cda5bc Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 18 Mar 2013 18:18:01 +0200 Subject: [PATCH 22/27] ENH: add (undocumented) legacy kwd to df.to_csv, just in case --- pandas/core/format.py | 96 +++++++++++++++++++++++++++++++++++++++++-- pandas/core/frame.py | 22 +++++----- 2 files changed, 104 insertions(+), 14 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 644c08b6b0e54..2237160efd941 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -769,8 +769,9 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None): + line_terminator='\n', chunksize=None,legacy=False): + self.legacy=legacy # remove for 0.12 self.obj = obj self.path_or_buf = path_or_buf self.sep = sep @@ -811,8 +812,86 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, if not index: self.nlevels = 0 - def save(self): + # legacy to be removed in 0.12 + def _helper_csv(self, writer, na_rep=None, cols=None, + header=True, index=True, + index_label=None, float_format=None): + if cols is None: + cols = self.columns + + series = {} + for k, v in self.obj._series.iteritems(): + series[k] = v.values + + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: + if index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(self.obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(self.obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = self.obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + data_index = self.obj.index + if isinstance(self.obj.index, PeriodIndex): + data_index = self.obj.index.to_timestamp() + + nlevels = getattr(data_index, 'nlevels', 1) + for j, idx in enumerate(data_index): + row_fields = [] + if index: + if nlevels == 1: + row_fields = [idx] + else: # handle MultiIndex + row_fields = list(idx) + for i, col in enumerate(cols): + val = series[col][j] + if lib.checknull(val): + val = na_rep + + if float_format is not None and com.is_float(val): + val = float_format % val + elif isinstance(val, np.datetime64): + val = lib.Timestamp(val)._repr_base + + row_fields.append(val) + + writer.writerow(row_fields) + + def save(self): # create the writer & save if hasattr(self.path_or_buf, 'read'): f = self.path_or_buf @@ -829,8 +908,17 @@ def save(self): else: self.writer = csv.writer(f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting) - - self._save() + + + if self.legacy: + # to be removed in 0.12 + self._helper_csv(self.writer, 
na_rep=self.na_rep, + float_format=self.float_format, cols=self.cols, + header=self.header, index=self.index, + index_label=self.index_label) + + else: + self._save() finally: if close: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2dc6715b0638..7cfb9ec03ba83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1291,7 +1291,7 @@ def to_panel(self): def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None): + line_terminator='\n', chunksize=None,**kwds): """ Write DataFrame to a comma-separated values (csv) file @@ -1336,15 +1336,17 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, FutureWarning) na_rep = nanRep - formatter = fmt.CSVFormatter(self, path_or_buf, - line_terminator=line_terminator, - sep=sep, encoding=encoding, - quoting=quoting,na_rep=na_rep, - float_format=float_format, cols=cols, - header=header, index=index, - index_label=index_label, - chunksize=chunksize) - formatter.save() + + else: + formatter = fmt.CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, + sep=sep, encoding=encoding, + quoting=quoting,na_rep=na_rep, + float_format=float_format, cols=cols, + header=header, index=index, + index_label=index_label, + chunksize=chunksize,legacy=kwds.get("legacy",False) ) + formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, From 0e42e46d92da245736ffcba32c15d91b7d7d786e Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 18 Mar 2013 13:38:46 -0400 Subject: [PATCH 23/27] TST: fail early on duplicate columns --- pandas/core/format.py | 8 ++++++-- pandas/tests/test_frame.py | 7 +++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 2237160efd941..59e34709caecb 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -770,7 +770,6 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None,legacy=False): - self.legacy=legacy # remove for 0.12 self.obj = obj self.path_or_buf = path_or_buf @@ -798,6 +797,11 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, else: cols=list(cols) self.cols = cols + + # fail early if we have duplicate columns + if len(set(self.cols)) != len(self.cols): + raise Exception("duplicate columns are not permitted in to_csv") + self.colname_map = dict((k,i) for i,k in enumerate(obj.columns)) if chunksize is None: @@ -909,7 +913,6 @@ def save(self): self.writer = csv.writer(f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting) - if self.legacy: # to be removed in 0.12 self._helper_csv(self.writer, na_rep=self.na_rep, @@ -920,6 +923,7 @@ def save(self): else: self._save() + finally: if close: f.close() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9452cc8b8c946..aeda07e558d7d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,8 +4593,11 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) - def test_to_csv_mixed_dups_cols(self): - filename = '__tmp_to_csv_mixed_dup_cols__.csv' + def test_to_csv_dups_cols(self): + filename = '__tmp_to_csv_dup_cols__.csv' + + df = DataFrame(np.random.randn(1000, 
30),columns=range(15)+range(15),dtype='float64') + self.assertRaises(Exception, df.to_csv, filename) df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') From 616347c98a1b3d0cd04d09f57f84ffb171323605 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 19 Mar 2013 09:15:05 +0200 Subject: [PATCH 24/27] CLN: preallocate data array only once --- pandas/core/format.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 59e34709caecb..ef14c830e1c37 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -798,6 +798,11 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=list(cols) self.cols = cols + # preallocate data 2d list + self.blocks = self.obj._data.blocks + ncols = sum(len(b.items) for b in self.blocks) + self.data =[None] * ncols + # fail early if we have duplicate columns if len(set(self.cols)) != len(self.cols): raise Exception("duplicate columns are not permitted in to_csv") @@ -1001,18 +1006,17 @@ def _save_chunk(self, start_i, end_i): data_index = self.data_index # create the data for a chunk - blocks = self.obj._data.blocks - data =[None] * sum(len(b.items) for b in blocks) slicer = slice(start_i,end_i) - for i in range(len(blocks)): - b = blocks[i] + for i in range(len(self.blocks)): + b = self.blocks[i] d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] + # self.data is a preallocated list + self.data[colname_map[k]] = d[j] ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) - lib.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) + lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) # from collections import namedtuple # ExcelCell = namedtuple("ExcelCell", From 87a391ecfba20fd181e3bb883026cf4f213be6e1 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 19 Mar 2013 07:28:41 +0200 Subject: [PATCH 25/27] TST: test the hell out of the new df.to_csv() --- pandas/tests/test_frame.py | 109 +++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aeda07e558d7d..7051c193dffd4 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4450,6 +4450,115 @@ def test_to_csv_from_csv(self): os.remove(path) + def test_to_csv_moar(self): + from pandas.util.testing import makeCustomDataframe as mkdf + path = '__tmp_to_csv_dupe_cols__' + def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None): + try: + df.to_csv(path,encoding='utf8') + recons = DataFrame.from_csv(path) + except: + os.remove(path) + raise + else: + def _to_uni(x): + if not isinstance(x,unicode): + return x.decode('utf8') + return x + if rnlvl: + delta_lvl = [recons.icol(i).values for i in range(rnlvl-1)] + ix=MultiIndex.from_arrays([list(recons.index)]+delta_lvl) + recons.index = ix + recons = recons.iloc[:,rnlvl-1:] + + if cnlvl: + def stuple_to_tuple(x): + import re + x = x.split(",") + x = map(lambda x: re.sub("[\'\"\s\(\)]","",x),x) + return x + + cols=MultiIndex.from_tuples(map(stuple_to_tuple,recons.columns)) + recons.columns = cols + + type_map = dict(i='i',f='f',s='O',u='O',dt='O') + if r_dtype: + if r_dtype == 'u': # unicode + r_dtype='O' + recons.index = np.array(map(_to_uni,recons.index), + dtype=r_dtype ) + df.index = 
np.array(map(_to_uni,df.index),dtype=r_dtype )
+                elif r_dtype == 'dt': # datetime
+                    r_dtype='O'
+                    recons.index = np.array(map(Timestamp,recons.index),
+                                            dtype=r_dtype )
+                    df.index = np.array(map(Timestamp,df.index),dtype=r_dtype )
+                else:
+                    r_dtype= type_map.get(r_dtype)
+                    recons.index = np.array(recons.index,dtype=r_dtype )
+                    df.index = np.array(df.index,dtype=r_dtype )
+            if c_dtype:
+                if c_dtype == 'u':
+                    c_dtype='O'
+                    recons.columns = np.array(map(_to_uni,recons.columns),
+                                              dtype=c_dtype )
+                    df.columns = np.array(map(_to_uni,df.columns),dtype=c_dtype )
+                elif c_dtype == 'dt':
+                    c_dtype='O'
+                    recons.columns = np.array(map(Timestamp,recons.columns),
+                                              dtype=c_dtype )
+                    df.columns = np.array(map(Timestamp,df.columns),dtype=c_dtype )
+                else:
+                    c_dtype= type_map.get(c_dtype)
+                    recons.columns = np.array(recons.columns,dtype=c_dtype )
+                    df.columns = np.array(df.columns,dtype=c_dtype )
+
+            assert_frame_equal(df, recons,check_names=False)
+
+        N = 100
+
+        for ncols in [1,10,30]:
+            base = int((100000/ ncols or 1) or 1)
+            for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2,
+                          base-1,base,base+1]:
+                print( nrows,ncols)
+                _do_test(mkdf(nrows, ncols),path)
+
+        for nrows in [10,N-2,N-1,N,N+1,N+2]:
+            df = mkdf(nrows, 10)
+            cols = list(df.columns)
+            cols[:1] = ["dupe","dupe"]
+            cols[-1:] = ["dupe","dupe"]
+            ix = list(df.index)
+            ix[:2] = ["rdupe","rdupe"]
+            ix[-2:] = ["rdupe","rdupe"]
+            print( nrows)
+
+            df.index=ix
+            _do_test(df,path)
+
+        for r_idx_type in ['i', 'f','s','u','dt']:
+            for c_idx_type in ['i', 'f','s','u','dt']:
+                print(r_idx_type,c_idx_type)
+                _do_test(mkdf(100, 1,r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type),path,r_idx_type,c_idx_type)
+                _do_test(mkdf(100, 2,r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type),path,r_idx_type,c_idx_type)
+
+        _do_test(DataFrame(index=range(10)),path)
+        _do_test(mkdf(50001, 2,r_idx_nlevels=2),path,rnlvl=2)
+        for ncols in [2,10,30]:
+            base = int(100000/ncols)
+            for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2,
+                          base-1,base,base+1]:
+                print(nrows, ncols)
+                _do_test(mkdf(nrows, ncols,r_idx_nlevels=2),path,rnlvl=2)
+                _do_test(mkdf(nrows, ncols,c_idx_nlevels=2),path,cnlvl=2)
+                _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2),
+                         path,rnlvl=2,cnlvl=2)
+
+
+
     def test_to_csv_from_csv_w_some_infs(self):
         path = '__%s__' % tm.rands(10)

From 22f258ffaf33b3988845839b06a728dd10475550 Mon Sep 17 00:00:00 2001
From: y-p
Date: Tue, 19 Mar 2013 09:27:45 +0200
Subject: [PATCH 26/27] BUG: MultiIndex to_native_types did not obey slicer

---
 pandas/core/index.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/index.py b/pandas/core/index.py
index 95e6c40a9dad8..8b42f2146a7cf 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1487,7 +1487,10 @@ def __len__(self):
         return len(self.labels[0])
 
     def to_native_types(self, slicer=None, na_rep='', float_format=None):
-        return self.tolist()
+        ix = self
+        if slicer:
+            ix = self[slicer]
+        return ix.tolist()
 
     @property
     def _constructor(self):
         return MultiIndex.from_tuples
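Note on PATCH 26: the slicer matters because CSVFormatter._save_chunk asks the
index for only the rows of the current chunk, via
data_index.to_native_types(slicer=...). Before the fix a MultiIndex ignored the
argument and re-emitted every row for every chunk. A minimal sketch of the
intended contract, illustrative only and written against a post-patch dev
build rather than taken from the test suite:

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

    # Each chunk must see only its own rows; with the fix the slicer is
    # honoured, so a two-row slice yields two index entries, not four.
    print(mi.to_native_types(slicer=slice(1, 3)))  # expected: [('a', 2), ('b', 1)]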
From 4d9a3d357bfc986ee1e86f74623f606be1afacdd Mon Sep 17 00:00:00 2001
From: y-p
Date: Tue, 19 Mar 2013 09:41:31 +0200
Subject: [PATCH 27/27] DOC: update what's new, RELEASE.rst

---
 RELEASE.rst            | 2 ++
 doc/source/v0.11.0.txt | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/RELEASE.rst b/RELEASE.rst
index 9cd2a620e6fce..51fdd527afdfa 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -47,6 +47,7 @@ pandas 0.11.0
 
 **Improvements to existing features**
 
+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
   - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to
     homogeneously dtyped DataFrames
   - added keyword ``convert_numeric`` to ``convert_objects()`` to try to
     convert object dtypes
@@ -185,6 +186,7 @@ pandas 0.11.0
 .. _GH3012: https://github.com/pydata/pandas/issues/3012
 .. _GH3029: https://github.com/pydata/pandas/issues/3029
 .. _GH3041: https://github.com/pydata/pandas/issues/3041
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
 
 
 pandas 0.10.1
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 60ec7de5c4d8e..09289bab5a0f4 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -229,6 +229,8 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
+
   - Numexpr is now a :ref:`Recommended Dependencies `, to accelerate certain
     types of numerical and boolean operations
@@ -331,3 +333,4 @@ on GitHub for a complete list.
 .. _GH2806: https://github.com/pydata/pandas/issues/2806
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2918: https://github.com/pydata/pandas/issues/2918
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
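As a closing illustration of the approach the release notes refer to, here is
a compact, self-contained sketch of the two ideas behind the speedup: masking
missing values once per dtype block (instead of calling lib.checknull on every
cell) and handing whole chunks of rows to csv.writer.writerows. This is an
illustrative rendition only, not the committed implementation; the function
names are invented for the example.

    import csv
    import numpy as np

    def float_block_to_native(values, na_rep='', float_format=None):
        # One vectorized NaN mask per 2-D float block replaces a
        # per-cell null check; the block is cast to object dtype once.
        mask = np.isnan(values)
        out = values.astype(object)
        out[mask] = na_rep
        if float_format:
            imask = (~mask).ravel()
            out.flat[imask] = np.array([float_format % v
                                        for v in values.ravel()[imask]])
        return out.tolist()

    def write_chunked(f, header, columns, chunksize=100000):
        # columns: one equal-length python list per output column
        # (assumes at least one column).
        writer = csv.writer(f)
        writer.writerow(header)
        nrows = len(columns[0])
        for start in range(0, nrows, chunksize):
            rows = zip(*[col[start:start + chunksize] for col in columns])
            writer.writerows(rows)

From user code the only visible knob is the new chunksize keyword, e.g.
df.to_csv('out.csv', chunksize=50000); when it is omitted, the formatter sizes
chunks so that roughly 100000 values are converted per pass.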