From ca83d5ec90402af14ddd82754efe6e6c1abbc3d8 Mon Sep 17 00:00:00 2001 From: y-p Date: Fri, 15 Mar 2013 04:33:25 +0200 Subject: [PATCH 01/27] ENH: improve performance of df.to_csv GH3054 --- pandas/core/common.py | 20 ++++++++++++++++++++ pandas/core/frame.py | 44 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index a3e8c09839891..54b6564badd03 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1594,6 +1594,26 @@ def _check_as_is(x): # empty queue self.queue.truncate(0) + def writerows(self, rows): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + for i, row in enumerate(rows): + rows[i] = [x if _check_as_is(x) + else pprint_thing(x).encode('utf-8') for x in row] + + self.writer.writerows([[s for s in row] for row in rows]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + _NS_DTYPE = np.dtype('M8[ns]') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ee586a2101f62..976b0c7a013f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1345,15 +1345,33 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) + + spaces = [None] * len(cols) + if index: + if nlevels == 1: + row_fields_f = lambda x: [x] + spaces + else: # handle MultiIndex + row_fields_f = lambda x: list(x) + spaces + else: + nlevels = 0 + row_fields_f = lambda x: [None] * len(cols) + + # In crude testing, N>100 yields little marginal improvement + N=100 + rows = [None]*N + + all_cols = False + if len(cols) < 10000: + all_cols = list(enumerate(cols)) + + j = None for j, idx in enumerate(data_index): - row_fields = [] - if index: - if nlevels == 1: - row_fields = [idx] - else: # handle MultiIndex - row_fields = list(idx) - for i, col in enumerate(cols): + row_fields = row_fields_f(idx) + + for i, col in (all_cols or enumerate(cols)): val = series[col][j] + + if lib.checknull(val): val = na_rep @@ -1362,9 +1380,17 @@ def _helper_csv(self, writer, na_rep=None, cols=None, elif isinstance(val, np.datetime64): val = lib.Timestamp(val)._repr_base - row_fields.append(val) + row_fields[i+nlevels] = val + + rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val + ,row_fields) + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if j is not None and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) - writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From d46fa22ec1ad577163f459a943a58eee2e8b183f Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 16:17:16 -0400 Subject: [PATCH 02/27] ENH: to_csv using masking to simplify dtype processing --- pandas/core/frame.py | 34 ++++++++++++++++++++-------------- pandas/tests/test_frame.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 976b0c7a013f5..c017890d234f6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1297,8 +1297,25 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series = 
{} for k, v in self._series.iteritems(): - series[k] = v.values - + mask = isnull(v) + imask = -mask + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + + series[k] = values + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1369,18 +1386,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, row_fields = row_fields_f(idx) for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - - - if lib.checknull(val): - val = na_rep - - if float_format is not None and com.is_float(val): - val = float_format % val - elif isinstance(val, np.datetime64): - val = lib.Timestamp(val)._repr_base - - row_fields[i+nlevels] = val + row_fields[i+nlevels] = series[col][j] rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val ,row_fields) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1c30dfd1abced..5d270bb037c7a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4562,6 +4562,37 @@ def test_to_csv_withcommas(self): os.remove(path) + def test_to_csv_mixed(self): + filename = '__tmp_to_csv_mixed__.csv' + def create_cols(name): + return [ "%s%03d" % (name,i) for i in xrange(5) ] + + df_float = DataFrame(np.random.randn(100, 5),dtype='float64',columns=create_cols('float')) + df_int = DataFrame(np.random.randn(100, 5),dtype='int64',columns=create_cols('int')) + df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) + df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) + + # add in some nans + df_float.ix[30:50,1:3] = np.nan + + #### this is a bug in read_csv right now #### + #df_dt.ix[30:50,1:3] = np.nan + + df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + # dtype + dtypes = dict() + for n,dtype in [('float',np.float64),('int',np.int64),('bool',np.bool),('object',np.object)]: + for c in create_cols(n): + dtypes[c] = dtype + + df.to_csv(filename) + + rs = pan.read_csv(filename, index_col=0, dtype=dtypes, parse_dates=create_cols('date')) + assert_frame_equal(rs, df) + os.remove(filename) + def test_to_csv_bug(self): path = '__tmp_to_csv_bug__.csv' f1 = StringIO('a,1.0\nb,2.0') From 7c6777675a27e345a99ea93603da695a8cd0df9c Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 02:06:27 +0200 Subject: [PATCH 03/27] ENH: more perf tweaks in df.to_csv --- pandas/core/frame.py | 56 +++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c017890d234f6..d8d2f07ac0ee3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1362,37 +1362,49 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - - spaces = [None] * len(cols) - if index: - if nlevels == 1: - row_fields_f = lambda x: [x] + spaces - 
else: # handle MultiIndex - row_fields_f = lambda x: list(x) + spaces - else: + if not index: nlevels = 0 - row_fields_f = lambda x: [None] * len(cols) # In crude testing, N>100 yields little marginal improvement N=100 - rows = [None]*N + + # pre-allocate rows + rows = [[None]*(nlevels+len(cols)) for x in range(N)] all_cols = False - if len(cols) < 10000: + if len(cols) < 10000: # 10000 as in "usually" all_cols = list(enumerate(cols)) j = None - for j, idx in enumerate(data_index): - row_fields = row_fields_f(idx) - - for i, col in (all_cols or enumerate(cols)): - row_fields[i+nlevels] = series[col][j] - - rows[ j % N ] = map(lambda val: np.asscalar(val) if isinstance(val,np.number) else val - ,row_fields) - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) + if nlevels == 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[0] = idx + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + elif nlevels > 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[:nlevels] = list(idx) + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + else: + for j, idx in enumerate(data_index): + row = rows[j % N] + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) if j is not None and (j < N-1 or (j % N) != N-1 ): writer.writerows(rows[:((j+1) % N)]) From 93496813ccfa6b635b72a307d00f296412e6eda6 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 19:30:36 -0400 Subject: [PATCH 04/27] PERF: cythonized parts of to_csv for increased perf --- pandas/core/frame.py | 48 +------------------------------------------- pandas/lib.pyx | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8d2f07ac0ee3..896880995feb6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1362,53 +1362,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - if not index: - nlevels = 0 - - # In crude testing, N>100 yields little marginal improvement - N=100 - - # pre-allocate rows - rows = [[None]*(nlevels+len(cols)) for x in range(N)] - - all_cols = False - if len(cols) < 10000: # 10000 as in "usually" - all_cols = list(enumerate(cols)) - - j = None - if nlevels == 1: - for j, idx in enumerate(data_index): - row = rows[j % N] - row[0] = idx - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - elif nlevels > 1: - for j, idx in enumerate(data_index): - row = rows[j % N] - row[:nlevels] = list(idx) - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - else: - for j, idx in enumerate(data_index): - row = rows[j % N] - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val 
- - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - - if j is not None and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) - + lib.write_csv_rows(series, list(data_index), index, nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 1fd579553f094..051c4e74a60b9 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -784,6 +784,51 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje return arr +@cython.boundscheck(False) +@cython.wraparound(False) +def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): + + cdef int N, j, i, ncols, ndata_index + cdef list rows, row_fields, spaces + cdef object v + + ncols = len(cols) + spaces = [None] * len(cols) + if index: + if nlevels == 1: + row_fields_f = lambda x: [x] + spaces + else: # handle MultiIndex + row_fields_f = lambda x: list(x) + spaces + else: + nlevels = 0 + row_fields_f = lambda x: [None] * len(cols) + + # In crude testing, N>100 yields little marginal improvement + N=100 + rows = [None]*N + + ndata_index = len(data_index) + for j in range(ndata_index): + row_fields = row_fields_f(data_index[j]) + + for i in range(len(row_fields)): + v = row_fields[i] + if isinstance(v,np.number): + row_fields[i] = np.asscalar(v) + for i in range(ncols): + v = series[cols[i]][j] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[i+nlevels] = v + + rows[ j % N ] = row_fields + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if ndata_index and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) + @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_2d(ndarray indexer0, From 10857b089ec66c7931a9336cf945ba397863f53c Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 20:15:11 -0400 Subject: [PATCH 05/27] PERF: more cython tweaks --- pandas/lib.pyx | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 051c4e74a60b9..4bb6e2df81642 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -788,40 +788,48 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.wraparound(False) def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): - cdef int N, j, i, ncols, ndata_index - cdef list rows, row_fields, spaces - cdef object v + cdef int N, j, i, l, ncols, ndata_index + cdef list rows, spaces + cdef object v, val + cdef ndarray row_fields ncols = len(cols) - spaces = [None] * len(cols) - if index: - if nlevels == 1: - row_fields_f = lambda x: [x] + spaces - else: # handle MultiIndex - row_fields_f = lambda x: list(x) + spaces - else: - nlevels = 0 - row_fields_f = lambda x: [None] * len(cols) # In crude testing, N>100 yields little marginal improvement N=100 rows = [None]*N ndata_index = len(data_index) + + if index: + row_fields = np.empty(ncols+nlevels,dtype=object) + else: + nlevels = 0 + row_fields = np.empty(ncols,dtype=object) + for j in range(ndata_index): - row_fields = row_fields_f(data_index[j]) - for i in range(len(row_fields)): - v = row_fields[i] - if isinstance(v,np.number): - row_fields[i] = np.asscalar(v) + if index: + if nlevels == 1: + v = data_index[j] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[0] = v + else: + val = data_index[j] + 
for l in range(nlevels): + v = val[l] + if isinstance(v,np.number): + v = np.asscalar(v) + row_fields[l] = v + for i in range(ncols): v = series[cols[i]][j] if isinstance(v,np.number): v = np.asscalar(v) row_fields[i+nlevels] = v - rows[ j % N ] = row_fields + rows[ j % N ] = row_fields.copy() if j >= N-1 and j % N == N-1: writer.writerows(rows) From 6d4e0bb1e959bc9f09bfc493dedbc4dff9528643 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 03:28:46 +0200 Subject: [PATCH 06/27] PERF: cythonize improved python version --- pandas/core/frame.py | 7 +++- pandas/lib.pyx | 88 ++++++++++++++++++++++---------------------- 2 files changed, 49 insertions(+), 46 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 896880995feb6..1c69efe487dc1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1315,7 +1315,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, values[imask] = np.array([ float_format % val for val in v[imask] ]) series[k] = values - + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1362,7 +1362,10 @@ def _helper_csv(self, writer, na_rep=None, cols=None, data_index = self.index.to_timestamp() nlevels = getattr(data_index, 'nlevels', 1) - lib.write_csv_rows(series, list(data_index), index, nlevels, list(cols), writer) + if not index: + nlevels = 0 + + lib.write_csv_rows(series, list(data_index), nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 4bb6e2df81642..3ecf08df29df6 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -786,56 +786,56 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(dict series, list data_index, object index, int nlevels, list cols, object writer): - - cdef int N, j, i, l, ncols, ndata_index - cdef list rows, spaces - cdef object v, val - cdef ndarray row_fields +def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): - ncols = len(cols) + cdef int N, j, i + cdef list rows, all_cols + cdef object val # In crude testing, N>100 yields little marginal improvement N=100 - rows = [None]*N - - ndata_index = len(data_index) - if index: - row_fields = np.empty(ncols+nlevels,dtype=object) + # pre-allocate rows + rows = [[None]*(nlevels+len(cols)) for x in range(N)] + + all_cols = [] + if len(cols) < 10000: # 10000 as in "usually" + all_cols = list(enumerate(cols)) + + j = -1 + if nlevels == 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[0] = idx + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + elif nlevels > 1: + for j, idx in enumerate(data_index): + row = rows[j % N] + row[:nlevels] = list(idx) + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) else: - nlevels = 0 - row_fields = np.empty(ncols,dtype=object) - - for j in range(ndata_index): - - if index: - if nlevels == 1: - v = data_index[j] - if isinstance(v,np.number): - v = np.asscalar(v) - row_fields[0] = v - else: - val = data_index[j] - for l in range(nlevels): - v = val[l] - if isinstance(v,np.number): - v 
= np.asscalar(v) - row_fields[l] = v - - for i in range(ncols): - v = series[cols[i]][j] - if isinstance(v,np.number): - v = np.asscalar(v) - row_fields[i+nlevels] = v - - rows[ j % N ] = row_fields.copy() - - if j >= N-1 and j % N == N-1: - writer.writerows(rows) - - if ndata_index and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) + for j, idx in enumerate(data_index): + row = rows[j % N] + for i, col in (all_cols or enumerate(cols)): + val = series[col][j] + row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if j >= 0 and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) + @cython.boundscheck(False) @cython.wraparound(False) From 7ac83ebe5e0e802b490767eb377fc1fab44a21b1 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 19:30:36 -0400 Subject: [PATCH 07/27] PERF: cythonized parts of to_csv for increased perf --- pandas/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 3ecf08df29df6..5eaa7375d23ab 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -786,6 +786,7 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) + def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): cdef int N, j, i From d78f4f6a6bb03b0f2ac658cac630456c6971a755 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 23:15:40 -0400 Subject: [PATCH 08/27] PERF: more speedups --- pandas/core/frame.py | 2 +- pandas/lib.pyx | 36 +++++++++++++++--------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1c69efe487dc1..5a97b25422e7c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1314,7 +1314,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if float_format: values[imask] = np.array([ float_format % val for val in v[imask] ]) - series[k] = values + series[k] = values.tolist() has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5eaa7375d23ab..2bc4eccdb4275 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -789,47 +789,41 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): - cdef int N, j, i - cdef list rows, all_cols + cdef int N, j, i, ncols + cdef list rows cdef object val # In crude testing, N>100 yields little marginal improvement N=100 # pre-allocate rows - rows = [[None]*(nlevels+len(cols)) for x in range(N)] - - all_cols = [] - if len(cols) < 10000: # 10000 as in "usually" - all_cols = list(enumerate(cols)) + ncols = len(cols) + rows = [[None]*(nlevels+ncols) for x in range(N)] j = -1 if nlevels == 1: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - row[0] = idx - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + row[0] = data_index[j] + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) elif nlevels > 1: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - row[:nlevels] = list(idx) - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = 
np.asscalar(val) if isinstance(val,np.number) else val + row[:nlevels] = list(data_index[j]) + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) else: - for j, idx in enumerate(data_index): + for j in range(len(data_index)): row = rows[j % N] - for i, col in (all_cols or enumerate(cols)): - val = series[col][j] - row[nlevels+i] = np.asscalar(val) if isinstance(val,np.number) else val + for i in range(ncols): + row[nlevels+i] = series[cols[i]][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 55adfb7bfd16a85b33bbf9a87896057e688f4cbd Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 15 Mar 2013 23:52:14 -0400 Subject: [PATCH 09/27] ENH: add chunksize parameter to DataFrame.to_csv to enable constant memory usage by writing in chunks --- RELEASE.rst | 2 ++ pandas/core/frame.py | 70 ++++++++++++++++++++++++-------------- pandas/tests/test_frame.py | 16 +++++++++ 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 2eb7980458f8e..9cd2a620e6fce 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -62,6 +62,8 @@ pandas 0.11.0 strings that can be parsed with datetime.strptime - Add ``axes`` property to ``Series`` for compatibility - Add ``xs`` function to ``Series`` for compatibility + - Add ``chunksize`` parameter to ``to_csv`` to allow writing in chunks + to enable constant memory usage **API Changes** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a97b25422e7c..6a8de3402228a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1291,31 +1291,11 @@ def to_panel(self): def _helper_csv(self, writer, na_rep=None, cols=None, header=True, index=True, - index_label=None, float_format=None): + index_label=None, float_format=None, + chunksize=None): if cols is None: cols = self.columns - series = {} - for k, v in self._series.iteritems(): - mask = isnull(v) - imask = -mask - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - values = np.array(v.values,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) - - series[k] = values.tolist() - has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1365,12 +1345,50 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if not index: nlevels = 0 - lib.write_csv_rows(series, list(data_index), nlevels, list(cols), writer) + rows = len(data_index) + + # write in chunksize bites + if chunksize is None: + chunksize = 100000 + chunks = int(rows / chunksize)+1 + + for i in xrange(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, rows) + if start_i == end_i: + continue + + # create the data for a chunk + chunk = self.iloc[start_i:end_i] + + series = {} + for k, v in chunk.iteritems(): + mask = isnull(v) + imask = -mask + + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ 
lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + + series[k] = values.tolist() + + lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, list(cols), writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n'): + line_terminator='\n', chunksize=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1407,6 +1425,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, file quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL + chunksize : rows to write at a time """ if nanRep is not None: # pragma: no cover import warnings @@ -1435,7 +1454,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, self._helper_csv(csvout, na_rep=na_rep, float_format=float_format, cols=cols, header=header, index=index, - index_label=index_label) + index_label=index_label, + chunksize=chunksize) finally: if close: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5d270bb037c7a..286fb5906f63d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,6 +4593,22 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) + def test_to_csv_chunking(self): + filename = '__tmp_to_csv_chunking__.csv' + + aa=DataFrame({'A':range(100000)}) + + aa['B'] = aa.A + 1.0 + aa['C'] = aa.A + 2.0 + aa['D'] = aa.A + 3.0 + + for chunksize in [10000,50000,100000]: + aa.to_csv(filename,chunksize=chunksize) + rs = pan.read_csv(filename,index_col=0) + assert_frame_equal(rs, aa) + + os.remove(filename) + def test_to_csv_bug(self): path = '__tmp_to_csv_bug__.csv' f1 = StringIO('a,1.0\nb,2.0') From dcc45a73b52f03c2bbe1d0a0945ab86b858426d8 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:04:56 +0200 Subject: [PATCH 10/27] CLN: move repeated cast out of loop --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6a8de3402228a..a52609821ec2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1352,6 +1352,8 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = 100000 chunks = int(rows / chunksize)+1 + cols = list(cols) + for i in xrange(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, rows) @@ -1383,7 +1385,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series[k] = values.tolist() - lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, list(cols), writer) + lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From 5a7c28d80dc37531a9ee9e474472d5349c174f63 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:05:40 +0200 Subject: [PATCH 11/27] CLN: make guard more defensive --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a52609821ec2b..7c65b105cb054 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1345,20 +1345,20 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if not index: nlevels = 
0 - rows = len(data_index) + nrows = len(data_index) # write in chunksize bites if chunksize is None: chunksize = 100000 - chunks = int(rows / chunksize)+1 + chunks = int(nrows / chunksize)+1 cols = list(cols) for i in xrange(chunks): start_i = i * chunksize - end_i = min((i + 1) * chunksize, rows) - if start_i == end_i: - continue + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break # create the data for a chunk chunk = self.iloc[start_i:end_i] From ba620668ef46c245dbd1e557ab7faa2d8eef3edc Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 10:52:18 +0200 Subject: [PATCH 12/27] REF: add com._ndarray_to_native_types --- pandas/core/common.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index 54b6564badd03..6babf24530f6f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -101,6 +101,27 @@ def _isnull_old(obj): _isnull = _isnull_new +# float format is a bit of out of place here, +# but we'd like to reuse the mask. +def _ndarray_to_native_types(v,na_rep='',float_format=None): + mask = isnull(v) + imask = -mask + + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + values = np.empty(len(v),dtype=object) + values[mask] = 'NaT' + + if v.dtype == 'datetime64[ns]': + values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) + elif v.dtype == 'timedelta64[ns]': + values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) + else: + values = np.array(v.values,dtype=object) + values[mask] = na_rep + if issubclass(v.dtype.type,np.floating): + if float_format: + values[imask] = np.array([ float_format % val for val in v[imask] ]) + return values.tolist() def _use_inf_as_null(key): '''Option change callback for null/inf behaviour From 6c6f6cfee99bef6174702ffccef9dd0c91a83978 Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 16 Mar 2013 10:54:14 +0200 Subject: [PATCH 13/27] REF: apply native type conv to ix, cols before write_csv --- pandas/core/frame.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7c65b105cb054..6de1e3012bef1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -24,7 +24,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, - _infer_dtype_from_scalar) + _infer_dtype_from_scalar, _ndarray_to_native_types) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -1352,7 +1352,10 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = 100000 chunks = int(nrows / chunksize)+1 - cols = list(cols) + if isinstance(cols,np.ndarray): + cols = _ndarray_to_native_types(cols,na_rep,float_format) + else: + cols=list(cols) for i in xrange(chunks): start_i = i * chunksize @@ -1365,27 +1368,12 @@ def _helper_csv(self, writer, na_rep=None, cols=None, series = {} for k, v in chunk.iteritems(): - mask = isnull(v) - imask = -mask - - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - values = 
np.array(v.values,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) + series[k] = _ndarray_to_native_types(v,na_rep,float_format) - series[k] = values.tolist() + ix = _ndarray_to_native_types(data_index[start_i:end_i], + na_rep,float_format) - lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, cols, writer) + lib.write_csv_rows(series, ix, nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, From 51793db20ebd0f87141ac72d1f07e0e2a9236656 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 16:16:55 -0400 Subject: [PATCH 14/27] PERF: added frame_to_csv2 vbench, revised frame_to_csv_mixed --- vb_suite/io_bench.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index ba386bd0e9649..dc335a4f994d5 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -44,17 +44,34 @@ """ frame_to_csv = Benchmark("df.to_csv('__test__.csv')", setup, start_date=datetime(2011, 1, 1)) +#---------------------------------- + +setup = common_setup + """ +df=DataFrame({'A':range(100000)}) +df['B'] = df.A + 1.0 +df['C'] = df.A + 2.0 +df['D'] = df.A + 3.0 +""" +frame_to_csv2 = Benchmark("df.to_csv('__test__.csv')", setup, + start_date=datetime(2011, 1, 1)) #---------------------------------- setup = common_setup + """ from pandas import concat, Timestamp -df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') -df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') -df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns) -df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns) -df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) +def create_cols(name): + return [ "%s%03d" % (name,i) for i in xrange(5) ] +df_float = DataFrame(np.random.randn(10000, 5),dtype='float64',columns=create_cols('float')) +df_int = DataFrame(np.random.randn(10000, 5),dtype='int64',columns=create_cols('int')) +df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) +df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) +df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) + +# add in some nans +df_float.ix[30:500,1:3] = np.nan + df = concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + """ frame_to_csv_mixed = Benchmark("df.to_csv('__test__.csv')", setup, start_date=datetime(2012, 6, 1)) From bb7d1da0d42b575815a4cb1b1d1cbb1055c481bb Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 06:06:03 +0200 Subject: [PATCH 15/27] TST: test for to_csv on failing vbench duplicate column names across dtypes is a problem, and not-easy to fix, so letting test fail . 
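The shape of the failure, as a rough sketch (simplified, not the
exact code path; the names mirror the locals in _helper_csv and the
data is made up): converted values are staged per column in a dict
keyed by column name, so duplicate names collapse to a single entry
and every duplicate ends up writing the same (last-seen) column's
values:

    series = {}
    for k, v in [('a', [1, 2]), ('b', [3, 4]), ('a', [5, 6])]:
        series[k] = v       # the second 'a' silently replaces the first
    print(len(series))      # 2 entries staged for 3 columns
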
--- pandas/tests/test_frame.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 286fb5906f63d..9452cc8b8c946 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,6 +4593,19 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) + def test_to_csv_mixed_dups_cols(self): + filename = '__tmp_to_csv_mixed_dup_cols__.csv' + + df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') + df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') + df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns) + df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) + df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + #### this raises because we have duplicate column names across dtypes #### + self.assertRaises(Exception, df.to_csv, filename) + def test_to_csv_chunking(self): filename = '__tmp_to_csv_chunking__.csv' From 71cb70d28bbfe9cde99404c5ff260be6013de423 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 06:07:43 +0200 Subject: [PATCH 16/27] ENH: refactor series from dict to list, eliminate one level of indirection --- pandas/core/frame.py | 8 +++----- pandas/lib.pyx | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6de1e3012bef1..20a366a3f3662 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1366,14 +1366,12 @@ def _helper_csv(self, writer, na_rep=None, cols=None, # create the data for a chunk chunk = self.iloc[start_i:end_i] - series = {} - for k, v in chunk.iteritems(): - series[k] = _ndarray_to_native_types(v,na_rep,float_format) - + data = [ _ndarray_to_native_types(v,na_rep,float_format + ) for k, v in chunk.iteritems() ] ix = _ndarray_to_native_types(data_index[start_i:end_i], na_rep,float_format) - lib.write_csv_rows(series, ix, nlevels, cols, writer) + lib.write_csv_rows(data, ix, nlevels, cols, writer) def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 2bc4eccdb4275..850cd7fb97b2d 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -787,7 +787,7 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(dict series, list data_index, int nlevels, list cols, object writer): +def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): cdef int N, j, i, ncols cdef list rows @@ -806,7 +806,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[nlevels+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) @@ -815,7 +815,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object row = rows[j % N] row[:nlevels] = list(data_index[j]) for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[nlevels+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) @@ -823,7 +823,7 @@ def write_csv_rows(dict series, list data_index, int nlevels, list cols, object for j in range(len(data_index)): row = rows[j % N] for i in range(ncols): - row[nlevels+i] = series[cols[i]][j] + row[i] = 
data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 7dc5f923e1f10c94716f1040a2294d4811873d27 Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 17 Mar 2013 06:08:54 +0200 Subject: [PATCH 17/27] ENH: replace variable lookup by constant . this is getting silly. --- pandas/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 850cd7fb97b2d..e12b524dda736 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -806,7 +806,7 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[nlevels+i] = data[i][j] + row[1+i] = data[i][j] if j >= N-1 and j % N == N-1: writer.writerows(rows) From 66e38f9951923fdd36a9f7a7def85d0b1226c566 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 11:58:20 +0200 Subject: [PATCH 18/27] ENH: make chunks process constant element count --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20a366a3f3662..946f98ae26cf4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1349,7 +1349,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, # write in chunksize bites if chunksize is None: - chunksize = 100000 + chunksize = (100000/ (len(cols) or 1)) or 1 chunks = int(nrows / chunksize)+1 if isinstance(cols,np.ndarray): From 20d32471baa4c191fb02deb0bb54cd0e5d10edfe Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 17 Mar 2013 12:01:51 +0200 Subject: [PATCH 19/27] PERF: avoid iteritems->iloc panelty for data conversion, use blocks --- pandas/core/common.py | 5 +++-- pandas/core/frame.py | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 6babf24530f6f..aff9001f2797c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -110,13 +110,14 @@ def _ndarray_to_native_types(v,na_rep='',float_format=None): if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': values = np.empty(len(v),dtype=object) values[mask] = 'NaT' - if v.dtype == 'datetime64[ns]': values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) elif v.dtype == 'timedelta64[ns]': values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) else: - values = np.array(v.values,dtype=object) + if hasattr(v,"values"): + v= v.values + values = np.array(v,dtype=object) values[mask] = na_rep if issubclass(v.dtype.type,np.floating): if float_format: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 946f98ae26cf4..23d1ad08cf9fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1296,6 +1296,11 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if cols is None: cols = self.columns + if isinstance(cols,np.ndarray): + cols = _ndarray_to_native_types(cols,na_rep,float_format) + else: + cols=list(cols) + has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -1352,11 +1357,6 @@ def _helper_csv(self, writer, na_rep=None, cols=None, chunksize = (100000/ (len(cols) or 1)) or 1 chunks = int(nrows / chunksize)+1 - if isinstance(cols,np.ndarray): - cols = _ndarray_to_native_types(cols,na_rep,float_format) - else: - cols=list(cols) - for i in xrange(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, nrows) @@ -1364,10 +1364,22 @@ def _helper_csv(self, writer, na_rep=None, cols=None, break # create the data for a chunk - 
chunk = self.iloc[start_i:end_i] - data = [ _ndarray_to_native_types(v,na_rep,float_format - ) for k, v in chunk.iteritems() ] + blocks = self._data.blocks + data =[None] * sum(len(b.items) for b in blocks) + for i in range(len(blocks)): + b = blocks[i] + v = b.values + colname_map = dict((k,i) for i,k in enumerate(self.columns)) + if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': + d = blocks[i].values[:,start_i:end_i] + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + else: + d = _ndarray_to_native_types(b.values[:,start_i:end_i], na_rep,float_format) + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + ix = _ndarray_to_native_types(data_index[start_i:end_i], na_rep,float_format) From 67ca8ae4fe47301df85cbee9eaf8e1a6a72cefa9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 16 Mar 2013 23:49:08 -0400 Subject: [PATCH 20/27] TST: test for to_csv on failing vbench duplicate column names across dtypes is a problem, and not-easy to fix, so letting test fail --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 23d1ad08cf9fa..e2c053f53dfa4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1364,7 +1364,6 @@ def _helper_csv(self, writer, na_rep=None, cols=None, break # create the data for a chunk - blocks = self._data.blocks data =[None] * sum(len(b.items) for b in blocks) for i in range(len(blocks)): From 099520871736ad741904d9f65c255831e526e9db Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 18:47:20 +0200 Subject: [PATCH 21/27] CLN: csv refactor --- pandas/core/common.py | 23 ------ pandas/core/format.py | 161 ++++++++++++++++++++++++++++++++++++++- pandas/core/frame.py | 134 +++----------------------------- pandas/core/index.py | 44 ++++++++--- pandas/core/internals.py | 47 +++++++++++- 5 files changed, 250 insertions(+), 159 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index aff9001f2797c..207ed2edac4bc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -101,29 +101,6 @@ def _isnull_old(obj): _isnull = _isnull_new -# float format is a bit of out of place here, -# but we'd like to reuse the mask. -def _ndarray_to_native_types(v,na_rep='',float_format=None): - mask = isnull(v) - imask = -mask - - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - values = np.empty(len(v),dtype=object) - values[mask] = 'NaT' - if v.dtype == 'datetime64[ns]': - values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object) - elif v.dtype == 'timedelta64[ns]': - values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object) - else: - if hasattr(v,"values"): - v= v.values - values = np.array(v,dtype=object) - values[mask] = na_rep - if issubclass(v.dtype.type,np.floating): - if float_format: - values[imask] = np.array([ float_format % val for val in v[imask] ]) - return values.tolist() - def _use_inf_as_null(key): '''Option change callback for null/inf behaviour Choose which replacement for numpy.isnan / -numpy.isfinite is used. 
diff --git a/pandas/core/format.py b/pandas/core/format.py index 003b1fefd01f7..644c08b6b0e54 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -9,7 +9,7 @@ from io import StringIO from pandas.core.common import adjoin, isnull, notnull -from pandas.core.index import MultiIndex, _ensure_index +from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.util import py3compat from pandas.core.config import get_option, set_option, reset_option import pandas.core.common as com @@ -18,6 +18,7 @@ import numpy as np import itertools +import csv from pandas.tseries.period import PeriodIndex @@ -763,6 +764,164 @@ def grouper(x): return result +class CSVFormatter(object): + + def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, + cols=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None, quoting=None, + line_terminator='\n', chunksize=None): + + self.obj = obj + self.path_or_buf = path_or_buf + self.sep = sep + self.na_rep = na_rep + self.float_format = float_format + + self.header = header + self.index = index + self.index_label = index_label + self.mode = mode + self.encoding = encoding + + if quoting is None: + quoting = csv.QUOTE_MINIMAL + self.quoting = quoting + + self.line_terminator = line_terminator + + if cols is None: + cols = obj.columns + + if isinstance(cols,Index): + cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) + else: + cols=list(cols) + self.cols = cols + self.colname_map = dict((k,i) for i,k in enumerate(obj.columns)) + + if chunksize is None: + chunksize = (100000/ (len(self.cols) or 1)) or 1 + self.chunksize = chunksize + + self.data_index = obj.index + if isinstance(obj.index, PeriodIndex): + self.data_index = obj.index.to_timestamp() + + self.nlevels = getattr(self.data_index, 'nlevels', 1) + if not index: + self.nlevels = 0 + + def save(self): + + # create the writer & save + if hasattr(self.path_or_buf, 'read'): + f = self.path_or_buf + close = False + else: + f = com._get_handle(self.path_or_buf, self.mode, encoding=self.encoding) + close = True + + try: + if self.encoding is not None: + self.writer = com.UnicodeWriter(f, lineterminator=self.line_terminator, + delimiter=self.sep, encoding=self.encoding, + quoting=self.quoting) + else: + self.writer = csv.writer(f, lineterminator=self.line_terminator, + delimiter=self.sep, quoting=self.quoting) + + self._save() + + finally: + if close: + f.close() + + def _save_header(self): + + writer = self.writer + obj = self.obj + index_label = self.index_label + cols = self.cols + header = self.header + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or self.header: + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + 
writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + def _save(self): + + self._save_header() + + nrows = len(self.data_index) + + # write in chunksize bites + chunksize = self.chunksize + chunks = int(nrows / chunksize)+1 + + for i in xrange(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self._save_chunk(start_i, end_i) + + def _save_chunk(self, start_i, end_i): + + colname_map = self.colname_map + data_index = self.data_index + + # create the data for a chunk + blocks = self.obj._data.blocks + data =[None] * sum(len(b.items) for b in blocks) + slicer = slice(start_i,end_i) + for i in range(len(blocks)): + b = blocks[i] + d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + for j, k in enumerate(b.items): + data[colname_map[k]] = d[j] + + ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + + lib.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) + # from collections import namedtuple # ExcelCell = namedtuple("ExcelCell", # 'row, col, val, style, mergestart, mergeend') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e2c053f53dfa4..b2dc6715b0638 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,7 +14,6 @@ from itertools import izip from StringIO import StringIO -import csv import operator import sys @@ -24,7 +23,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, - _infer_dtype_from_scalar, _ndarray_to_native_types) + _infer_dtype_from_scalar) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -1289,101 +1288,6 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csv(self, writer, na_rep=None, cols=None, - header=True, index=True, - index_label=None, float_format=None, - chunksize=None): - if cols is None: - cols = self.columns - - if isinstance(cols,np.ndarray): - cols = _ndarray_to_native_types(cols,na_rep,float_format) - else: - cols=list(cols) - - has_aliases = isinstance(header, (tuple, list, np.ndarray)) - if has_aliases or header: - if index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(self.index, MultiIndex): - index_label = [] - for i, name in enumerate(self.index.names): - if name is None: - name = '' - index_label.append(name) - else: - index_label = self.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, (list, tuple, np.ndarray)): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(header)))) - else: - write_cols = header - else: - write_cols = cols - encoded_cols = list(write_cols) - - writer.writerow(encoded_labels + encoded_cols) - else: - encoded_cols = list(cols) - writer.writerow(encoded_cols) - - data_index = self.index - if isinstance(self.index, PeriodIndex): - data_index = self.index.to_timestamp() - - nlevels = getattr(data_index, 'nlevels', 1) - if not index: - nlevels = 0 - - nrows = len(data_index) - - # write in chunksize bites - 
if chunksize is None: - chunksize = (100000/ (len(cols) or 1)) or 1 - chunks = int(nrows / chunksize)+1 - - for i in xrange(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) - if start_i >= end_i: - break - - # create the data for a chunk - blocks = self._data.blocks - data =[None] * sum(len(b.items) for b in blocks) - for i in range(len(blocks)): - b = blocks[i] - v = b.values - colname_map = dict((k,i) for i,k in enumerate(self.columns)) - if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]': - d = blocks[i].values[:,start_i:end_i] - for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] - else: - d = _ndarray_to_native_types(b.values[:,start_i:end_i], na_rep,float_format) - for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] - - ix = _ndarray_to_native_types(data_index[start_i:end_i], - na_rep,float_format) - - lib.write_csv_rows(data, ix, nlevels, cols, writer) - def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, @@ -1432,33 +1336,15 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, FutureWarning) na_rep = nanRep - if hasattr(path_or_buf, 'read'): - f = path_or_buf - close = False - else: - f = com._get_handle(path_or_buf, mode, encoding=encoding) - close = True - - if quoting is None: - quoting = csv.QUOTE_MINIMAL - - try: - if encoding is not None: - csvout = com.UnicodeWriter(f, lineterminator=line_terminator, - delimiter=sep, encoding=encoding, - quoting=quoting) - else: - csvout = csv.writer(f, lineterminator=line_terminator, - delimiter=sep, quoting=quoting) - self._helper_csv(csvout, na_rep=na_rep, - float_format=float_format, cols=cols, - header=header, index=index, - index_label=index_label, - chunksize=chunksize) - - finally: - if close: - f.close() + formatter = fmt.CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, + sep=sep, encoding=encoding, + quoting=quoting,na_rep=na_rep, + float_format=float_format, cols=cols, + header=header, index=index, + index_label=index_label, + chunksize=chunksize) + formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, diff --git a/pandas/core/index.py b/pandas/core/index.py index 0f9776e202c00..95e6c40a9dad8 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -441,16 +441,7 @@ def format(self, name=False, formatter=None, na_rep='NaN'): return header + list(self.map(formatter)) if self.is_all_dates: - zero_time = time(0, 0) - result = [] - for dt in self: - if isnull(dt): - result.append(u'NaT') - else: - if dt.time() != zero_time or dt.tzinfo is not None: - return header + [u'%s' % x for x in self] - result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) - return header + result + return header + _date_formatter(self) values = self.values @@ -472,6 +463,20 @@ def format(self, name=False, formatter=None, na_rep='NaN'): result = _trim_front(format_array(values, None, justify='left')) return header + result + def to_native_types(self, slicer=None, na_rep='', float_format=None): + values = self + if slicer is not None: + values = values[slicer] + mask = isnull(values) + values = np.array(values,dtype=object) + + if self.is_all_dates: + return _date_formatter(self) + else: + values[mask] = na_rep + + return values.tolist() + def equals(self, other): """ Determines if two Index objects contain the same elements. 
@@ -1481,6 +1486,9 @@ def __repr__(self): def __len__(self): return len(self.labels[0]) + def to_native_types(self, slicer=None, na_rep='', float_format=None): + return self.tolist() + @property def _constructor(self): return MultiIndex.from_tuples @@ -2578,6 +2586,22 @@ def _wrap_joined_index(self, joined, other): # For utility purposes +def _date_formatter(obj, na_rep=u'NaT'): + data = list(obj) + + # tz formatter or time formatter + zero_time = time(0, 0) + for d in data: + if d.time() != zero_time or d.tzinfo is not None: + return [u'%s' % x for x in data ] + + values = np.array(data,dtype=object) + mask = isnull(obj.values) + values[mask] = na_rep + + imask = -mask + values[imask] = np.array([ u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day) for dt in values[imask] ]) + return values.tolist() def _sparsify(label_list, start=0): pivoted = zip(*label_list) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2a41bbffa3b83..3467b72541481 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4,13 +4,14 @@ from numpy import nan import numpy as np -from pandas.core.common import _possibly_downcast_to_dtype +from pandas.core.common import isnull, _possibly_downcast_to_dtype from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib +from pandas.tslib import Timestamp from pandas.util import py3compat @@ -259,6 +260,17 @@ def _try_cast_result(self, result): we may have roundtripped thru object in the mean-time """ return result + def to_native_types(self, slicer=None, na_rep='', **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + values = np.array(values,dtype=object) + mask = isnull(values) + values[mask] = na_rep + return values.tolist() + def replace(self, to_replace, value, inplace=False): new_values = self.values if inplace else self.values.copy() if self._can_hold_element(value): @@ -577,6 +589,20 @@ def _try_cast(self, element): except: # pragma: no cover return element + def to_native_types(self, slicer=None, na_rep='', float_format=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + values = np.array(values,dtype=object) + mask = isnull(values) + values[mask] = na_rep + if float_format: + imask = (-mask).ravel() + values.flat[imask] = np.array([ float_format % val for val in values.ravel()[imask] ]) + return values.tolist() + def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily @@ -701,6 +727,25 @@ def _try_cast(self, element): except: return element + def to_native_types(self, slicer=None, na_rep=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:,slicer] + mask = isnull(values) + + rvalues = np.empty(self.shape,dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (-mask).ravel() + if self.dtype == 'datetime64[ns]': + rvalues.flat[imask] = np.array([ Timestamp(val)._repr_base for val in values.ravel()[imask] ],dtype=object) + elif self.dtype == 'timedelta64[ns]': + rvalues.flat[imask] = np.array([ lib.repr_timedelta64(val) for val in values.ravel()[imask] ],dtype=object) + 
return rvalues.tolist() + def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) From 77761288ad4d04f40826252b8556e2f360cda5bc Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 18 Mar 2013 18:18:01 +0200 Subject: [PATCH 22/27] ENH: add (undocumented) legacy kwd to df.to_csv, just in case --- pandas/core/format.py | 96 +++++++++++++++++++++++++++++++++++++++++-- pandas/core/frame.py | 22 +++++----- 2 files changed, 104 insertions(+), 14 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 644c08b6b0e54..2237160efd941 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -769,8 +769,9 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None): + line_terminator='\n', chunksize=None,legacy=False): + self.legacy=legacy # remove for 0.12 self.obj = obj self.path_or_buf = path_or_buf self.sep = sep @@ -811,8 +812,86 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, if not index: self.nlevels = 0 - def save(self): + # legacy to be removed in 0.12 + def _helper_csv(self, writer, na_rep=None, cols=None, + header=True, index=True, + index_label=None, float_format=None): + if cols is None: + cols = self.columns + + series = {} + for k, v in self.obj._series.iteritems(): + series[k] = v.values + + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: + if index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(self.obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(self.obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = self.obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + data_index = self.obj.index + if isinstance(self.obj.index, PeriodIndex): + data_index = self.obj.index.to_timestamp() + + nlevels = getattr(data_index, 'nlevels', 1) + for j, idx in enumerate(data_index): + row_fields = [] + if index: + if nlevels == 1: + row_fields = [idx] + else: # handle MultiIndex + row_fields = list(idx) + for i, col in enumerate(cols): + val = series[col][j] + if lib.checknull(val): + val = na_rep + + if float_format is not None and com.is_float(val): + val = float_format % val + elif isinstance(val, np.datetime64): + val = lib.Timestamp(val)._repr_base + + row_fields.append(val) + + writer.writerow(row_fields) + + def save(self): # create the writer & save if hasattr(self.path_or_buf, 'read'): f = self.path_or_buf @@ -829,8 +908,17 @@ def save(self): else: self.writer = csv.writer(f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting) - - self._save() + + + if self.legacy: + # to be removed in 0.12 + self._helper_csv(self.writer, 
na_rep=self.na_rep, + float_format=self.float_format, cols=self.cols, + header=self.header, index=self.index, + index_label=self.index_label) + + else: + self._save() finally: if close: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2dc6715b0638..7cfb9ec03ba83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1291,7 +1291,7 @@ def to_panel(self): def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - line_terminator='\n', chunksize=None): + line_terminator='\n', chunksize=None,**kwds): """ Write DataFrame to a comma-separated values (csv) file @@ -1336,15 +1336,17 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, FutureWarning) na_rep = nanRep - formatter = fmt.CSVFormatter(self, path_or_buf, - line_terminator=line_terminator, - sep=sep, encoding=encoding, - quoting=quoting,na_rep=na_rep, - float_format=float_format, cols=cols, - header=header, index=index, - index_label=index_label, - chunksize=chunksize) - formatter.save() + + else: + formatter = fmt.CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, + sep=sep, encoding=encoding, + quoting=quoting,na_rep=na_rep, + float_format=float_format, cols=cols, + header=header, index=index, + index_label=index_label, + chunksize=chunksize,legacy=kwds.get("legacy",False) ) + formatter.save() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, From 0e42e46d92da245736ffcba32c15d91b7d7d786e Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 18 Mar 2013 13:38:46 -0400 Subject: [PATCH 23/27] TST: fail early on duplicate columns --- pandas/core/format.py | 8 ++++++-- pandas/tests/test_frame.py | 7 +++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 2237160efd941..59e34709caecb 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -770,7 +770,6 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None,legacy=False): - self.legacy=legacy # remove for 0.12 self.obj = obj self.path_or_buf = path_or_buf @@ -798,6 +797,11 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, else: cols=list(cols) self.cols = cols + + # fail early if we have duplicate columns + if len(set(self.cols)) != len(self.cols): + raise Exception("duplicate columns are not permitted in to_csv") + self.colname_map = dict((k,i) for i,k in enumerate(obj.columns)) if chunksize is None: @@ -909,7 +913,6 @@ def save(self): self.writer = csv.writer(f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting) - if self.legacy: # to be removed in 0.12 self._helper_csv(self.writer, na_rep=self.na_rep, @@ -920,6 +923,7 @@ def save(self): else: self._save() + finally: if close: f.close() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9452cc8b8c946..aeda07e558d7d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4593,8 +4593,11 @@ def create_cols(name): assert_frame_equal(rs, df) os.remove(filename) - def test_to_csv_mixed_dups_cols(self): - filename = '__tmp_to_csv_mixed_dup_cols__.csv' + def test_to_csv_dups_cols(self): + filename = '__tmp_to_csv_dup_cols__.csv' + + df = DataFrame(np.random.randn(1000, 
30),columns=range(15)+range(15),dtype='float64') + self.assertRaises(Exception, df.to_csv, filename) df_float = DataFrame(np.random.randn(1000, 30),dtype='float64') df_int = DataFrame(np.random.randn(1000, 30),dtype='int64') From 616347c98a1b3d0cd04d09f57f84ffb171323605 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 19 Mar 2013 09:15:05 +0200 Subject: [PATCH 24/27] CLN: preallocate data array only once --- pandas/core/format.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 59e34709caecb..ef14c830e1c37 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -798,6 +798,11 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=list(cols) self.cols = cols + # preallocate data 2d list + self.blocks = self.obj._data.blocks + ncols = sum(len(b.items) for b in self.blocks) + self.data =[None] * ncols + # fail early if we have duplicate columns if len(set(self.cols)) != len(self.cols): raise Exception("duplicate columns are not permitted in to_csv") @@ -1001,18 +1006,17 @@ def _save_chunk(self, start_i, end_i): data_index = self.data_index # create the data for a chunk - blocks = self.obj._data.blocks - data =[None] * sum(len(b.items) for b in blocks) slicer = slice(start_i,end_i) - for i in range(len(blocks)): - b = blocks[i] + for i in range(len(self.blocks)): + b = self.blocks[i] d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) for j, k in enumerate(b.items): - data[colname_map[k]] = d[j] + # self.data is a preallocated list + self.data[colname_map[k]] = d[j] ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) - lib.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) + lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) # from collections import namedtuple # ExcelCell = namedtuple("ExcelCell", From 87a391ecfba20fd181e3bb883026cf4f213be6e1 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 19 Mar 2013 07:28:41 +0200 Subject: [PATCH 25/27] TST: test the hell out of the new df.to_csv() --- pandas/tests/test_frame.py | 109 +++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aeda07e558d7d..7051c193dffd4 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4450,6 +4450,115 @@ def test_to_csv_from_csv(self): os.remove(path) + def test_to_csv_moar(self): + from pandas.util.testing import makeCustomDataframe as mkdf + path = '__tmp_to_csv_dupe_cols__' + def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None): + try: + df.to_csv(path,encoding='utf8') + recons = DataFrame.from_csv(path) + except: + os.remove(path) + raise + else: + def _to_uni(x): + if not isinstance(x,unicode): + return x.decode('utf8') + return x + if rnlvl: + delta_lvl = [recons.icol(i).values for i in range(rnlvl-1)] + ix=MultiIndex.from_arrays([list(recons.index)]+delta_lvl) + recons.index = ix + recons = recons.iloc[:,rnlvl-1:] + + if cnlvl: + def stuple_to_tuple(x): + import re + x = x.split(",") + x = map(lambda x: re.sub("[\'\"\s\(\)]","",x),x) + return x + + cols=MultiIndex.from_tuples(map(stuple_to_tuple,recons.columns)) + recons.columns = cols + + type_map = dict(i='i',f='f',s='O',u='O',dt='O') + if r_dtype: + if r_dtype == 'u': # unicode + r_dtype='O' + recons.index = np.array(map(_to_uni,recons.index), + dtype=r_dtype ) + df.index = 
np.array(map(_to_uni,df.index),dtype=r_dtype )
+                elif r_dtype == 'dt': # datetime
+                    r_dtype='O'
+                    recons.index = np.array(map(Timestamp,recons.index),
+                                            dtype=r_dtype )
+                    df.index = np.array(map(Timestamp,df.index),dtype=r_dtype )
+                else:
+                    r_dtype= type_map.get(r_dtype)
+                    recons.index = np.array(recons.index,dtype=r_dtype )
+                    df.index = np.array(df.index,dtype=r_dtype )
+            if c_dtype:
+                if c_dtype == 'u':
+                    c_dtype='O'
+                    recons.columns = np.array(map(_to_uni,recons.columns),
+                                              dtype=c_dtype )
+                    df.columns = np.array(map(_to_uni,df.columns),dtype=c_dtype )
+                elif c_dtype == 'dt':
+                    c_dtype='O'
+                    recons.columns = np.array(map(Timestamp,recons.columns),
+                                              dtype=c_dtype )
+                    df.columns = np.array(map(Timestamp,df.columns),dtype=c_dtype )
+                else:
+                    c_dtype= type_map.get(c_dtype)
+                    recons.columns = np.array(recons.columns,dtype=c_dtype )
+                    df.columns = np.array(df.columns,dtype=c_dtype )
+
+            assert_frame_equal(df, recons,check_names=False)
+
+        N = 100
+
+        for ncols in [1,10,30]:
+            base = int((100000/ ncols or 1) or 1)
+            for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2,
+                          base-1,base,base+1]:
+                print( nrows,ncols)
+                _do_test(mkdf(nrows, ncols),path)
+
+        for nrows in [10,N-2,N-1,N,N+1,N+2]:
+            df = mkdf(nrows, 10)
+            cols = list(df.columns)
+            cols[:1] = ["dupe","dupe"]
+            cols[-1:] = ["dupe","dupe"]
+            ix = list(df.index)
+            ix[:2] = ["rdupe","rdupe"]
+            ix[-2:] = ["rdupe","rdupe"]
+            print( nrows)
+
+            df.index=ix
+            _do_test(df,path)
+
+        for r_idx_type in ['i', 'f','s','u','dt']:
+            for c_idx_type in ['i', 'f','s','u','dt']:
+                print(r_idx_type,c_idx_type)
+                _do_test(mkdf(100, 1,r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type),path,r_idx_type,c_idx_type)
+                _do_test(mkdf(100, 2,r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type),path,r_idx_type,c_idx_type)
+
+        _do_test(DataFrame(index=range(10)),path)
+        _do_test(mkdf(50001, 2,r_idx_nlevels=2),path,rnlvl=2)
+        for ncols in [2,10,30]:
+            base = int(100000/ncols)
+            for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2,
+                          base-1,base,base+1]:
+                print(nrows, ncols)
+                _do_test(mkdf(nrows, ncols,r_idx_nlevels=2),path,rnlvl=2)
+                _do_test(mkdf(nrows, ncols,c_idx_nlevels=2),path,cnlvl=2)
+                _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2),
+                         path,rnlvl=2,cnlvl=2)
+
+
+
     def test_to_csv_from_csv_w_some_infs(self):
         path = '__%s__' % tm.rands(10)

From 22f258ffaf33b3988845839b06a728dd10475550 Mon Sep 17 00:00:00 2001
From: y-p
Date: Tue, 19 Mar 2013 09:27:45 +0200
Subject: [PATCH 26/27] BUG: MultiIndex to_native_types did not obey slicer

---
 pandas/core/index.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/index.py b/pandas/core/index.py
index 95e6c40a9dad8..8b42f2146a7cf 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1487,7 +1487,10 @@ def __len__(self):
         return len(self.labels[0])
 
     def to_native_types(self, slicer=None, na_rep='', float_format=None):
-        return self.tolist()
+        ix = self
+        if slicer:
+            ix = self[slicer]
+        return ix.tolist()
 
     @property
     def _constructor(self):
         return MultiIndex.from_tuples
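Note on PATCH 26: the slicer matters because CSVFormatter._save_chunk asks the
index for only the rows of the current chunk, via
data_index.to_native_types(slicer=...). Before the fix a MultiIndex ignored the
argument and re-emitted every row for every chunk. A minimal sketch of the
intended contract, illustrative only and written against a post-patch dev
build rather than taken from the test suite:

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

    # Each chunk must see only its own rows; with the fix the slicer is
    # honoured, so a two-row slice yields two index entries, not four.
    print(mi.to_native_types(slicer=slice(1, 3)))  # expected: [('a', 2), ('b', 1)]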
From 4d9a3d357bfc986ee1e86f74623f606be1afacdd Mon Sep 17 00:00:00 2001
From: y-p
Date: Tue, 19 Mar 2013 09:41:31 +0200
Subject: [PATCH 27/27] DOC: update what's new, RELEASE.rst

---
 RELEASE.rst            | 2 ++
 doc/source/v0.11.0.txt | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/RELEASE.rst b/RELEASE.rst
index 9cd2a620e6fce..51fdd527afdfa 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -47,6 +47,7 @@ pandas 0.11.0
 
 **Improvements to existing features**
 
+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
   - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to
     homogeneously dtyped DataFrames
   - added keyword ``convert_numeric`` to ``convert_objects()`` to try to
     convert object dtypes
@@ -185,6 +186,7 @@ pandas 0.11.0
 .. _GH3012: https://github.com/pydata/pandas/issues/3012
 .. _GH3029: https://github.com/pydata/pandas/issues/3029
 .. _GH3041: https://github.com/pydata/pandas/issues/3041
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
 
 
 pandas 0.10.1
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 60ec7de5c4d8e..09289bab5a0f4 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -229,6 +229,8 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
+
   - Numexpr is now a :ref:`Recommended Dependencies `, to accelerate certain
     types of numerical and boolean operations
@@ -331,3 +333,4 @@ on GitHub for a complete list.
 .. _GH2806: https://github.com/pydata/pandas/issues/2806
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2918: https://github.com/pydata/pandas/issues/2918
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
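As a closing illustration of the approach the release notes refer to, here is
a compact, self-contained sketch of the two ideas behind the speedup: masking
missing values once per dtype block (instead of calling lib.checknull on every
cell) and handing whole chunks of rows to csv.writer.writerows. This is an
illustrative rendition only, not the committed implementation; the function
names are invented for the example.

    import csv
    import numpy as np

    def float_block_to_native(values, na_rep='', float_format=None):
        # One vectorized NaN mask per 2-D float block replaces a
        # per-cell null check; the block is cast to object dtype once.
        mask = np.isnan(values)
        out = values.astype(object)
        out[mask] = na_rep
        if float_format:
            imask = (~mask).ravel()
            out.flat[imask] = np.array([float_format % v
                                        for v in values.ravel()[imask]])
        return out.tolist()

    def write_chunked(f, header, columns, chunksize=100000):
        # columns: one equal-length python list per output column
        # (assumes at least one column).
        writer = csv.writer(f)
        writer.writerow(header)
        nrows = len(columns[0])
        for start in range(0, nrows, chunksize):
            rows = zip(*[col[start:start + chunksize] for col in columns])
            writer.writerows(rows)

From user code the only visible knob is the new chunksize keyword, e.g.
df.to_csv('out.csv', chunksize=50000); when it is omitted, the formatter sizes
chunks so that roughly 100000 values are converted per pass.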