From fb204a232307d094ecb76789c0528be295361465 Mon Sep 17 00:00:00 2001
From: Scott E Lasley <slasley@umd.edu>
Date: Mon, 25 Jan 2016 12:06:11 -0500
Subject: [PATCH 1/3] BUG: parser buffer could be freed more than once if
 reading failed in buffer_rd_bytes, causing a segfault

Closes #12098
---
 doc/source/whatsnew/v0.18.0.txt |  2 ++
 pandas/io/tests/test_parsers.py | 38 +++++++++++++++++++++++++++++++++
 pandas/src/parser/io.c          |  1 +
 3 files changed, 41 insertions(+)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 81696982d0fde..48fb56213fa6a 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -545,3 +545,5 @@ of columns didn't match the number of series provided (:issue:`12039`).
 - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`)
 
 - Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`)
+
+- Bug in ``buffer_rd_bytes`` where ``src->buffer`` could be freed more than once if reading failed, causing a segfault (:issue:`12098`)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 11ccb0eba8f72..0797354c1a92e 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3667,6 +3667,25 @@ def test_buffer_overflow(self):
                 self.assertIn(
                     'Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice, leading to a segfault, if a
+        # corrupt gzip file is read with read_csv and the buffer is filled
+        # more than once before gzip raises an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass
+
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
@@ -4208,6 +4227,25 @@ def test_buffer_overflow(self):
                 self.assertIn(
                     'Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice, leading to a segfault, if a
+        # corrupt gzip file is read with read_csv and the buffer is filled
+        # more than once before gzip raises an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass
+
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c
index 0297d1ba49527..566de72804968 100644
--- a/pandas/src/parser/io.c
+++ b/pandas/src/parser/io.c
@@ -121,6 +121,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes,
 
     /* delete old object */
     Py_XDECREF(src->buffer);
+    src->buffer = NULL;
     args = Py_BuildValue("(i)", nbytes);
 
     func = PyObject_GetAttrString(src->obj, "read");

From d77f0724b21cb290a0daec0202f2119543644236 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Tue, 26 Jan 2016 15:32:48 -0500
Subject: [PATCH 2/3] DOC: whatsnew edits

---
 doc/source/whatsnew/v0.18.0.txt | 60 ++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 81696982d0fde..ccdc48bc1dbbb 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -201,10 +201,6 @@ In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available thru t
    s
    s.dt.round('D')
 
-.. _whatsnew_0180.api:
-
-- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
-
 .. _whatsnew_0180.api_breaking:
 
 Backwards incompatible API changes
@@ -319,29 +315,6 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``.
    d = pd.Timestamp('2014-02-15')
    d + pd.offsets.QuarterBegin(n=0, startingMonth=2)
 
-
-Other API Changes
-^^^^^^^^^^^^^^^^^
-
-- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
-
-  .. ipython:: python
-
-     s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10))
-     s.between_time("7:00am", "9:00am")
-
-  This will now raise.
-
-  .. code-block:: python
-
-     In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00')
-     ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
-
-- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
-
-- ``DataFrame.to_latex()`` now supports non-ascii encodings (eg utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
-
-
 Changes to eval
 ^^^^^^^^^^^^^^^
 
@@ -397,6 +370,32 @@ assignments are valid for multi-line expressions.
    g = f / 2.0""", inplace=True)
    df
 
+
+.. _whatsnew_0180.api:
+
+Other API Changes
+^^^^^^^^^^^^^^^^^
+
+- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
+
+  .. ipython:: python
+
+     s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10))
+     s.between_time("7:00am", "9:00am")
+
+  This will now raise.
+
+  .. code-block:: python
+
+     In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00')
+     ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
+
+- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
+
+- ``DataFrame.to_latex()`` now supports non-ascii encodings (eg utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
+
+- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
+
 .. _whatsnew_0180.deprecations:
 
 Deprecations
@@ -502,7 +501,7 @@ Bug Fixes
 - Bug in ``pd.read_clipboard`` and ``pd.to_clipboard`` functions not supporting Unicode; upgrade included ``pyperclip`` to v1.5.15 (:issue:`9263`)
 - Bug in ``DataFrame.query`` containing an assignment (:issue:`8664`)
 
-- Bug in ``from_msgpack`` where ``__contains__()`` fails for columns of the unpacked ``DataFrame``, if the ``DataFrame`` has object columns. (:issue: `11880`)
+- Bug in ``from_msgpack`` where ``__contains__()`` fails for columns of the unpacked ``DataFrame``, if the ``DataFrame`` has object columns. (:issue:`11880`)
 
 
 - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`)
@@ -521,7 +520,7 @@ Bug Fixes
 - Bug in ``Index`` prevents copying name of passed ``Index``, when a new name is not provided (:issue:`11193`)
 - Bug in ``read_excel`` failing to read any non-empty sheets when empty sheets exist and ``sheetname=None`` (:issue:`11711`)
 - Bug in ``read_excel`` failing to raise ``NotImplemented`` error when keywords ``parse_dates`` and ``date_parser`` are provided (:issue:`11544`)
-- Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`)
+- Bug in ``read_sql`` with ``pymysql`` connections failing to return chunked data (:issue:`11522`)
 - Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`)
 - Bug in ``Int64Index`` and ``Float64Index`` preventing the use of the modulo operator (:issue:`9244`)
 
@@ -529,8 +528,7 @@ Bug Fixes
 - Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`)
 
 
-- Bug in ``.plot`` potentially modifying the ``colors`` input when the number
-of columns didn't match the number of series provided (:issue:`12039`).
+- Bug in ``.plot`` potentially modifying the ``colors`` input when the number of columns didn't match the number of series provided (:issue:`12039`).
 
 
 - Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)

From a1f0a79e059a807fcbdde3714152385e651a21c0 Mon Sep 17 00:00:00 2001
From: Scott E Lasley <slasley@umd.edu>
Date: Mon, 25 Jan 2016 12:06:11 -0500
Subject: [PATCH 3/3] BUG: parser buffer could be freed more than once if
 reading failed in buffer_rd_bytes, causing a segfault

Closes #12098
---
 doc/source/whatsnew/v0.18.0.txt |  2 ++
 pandas/io/tests/test_parsers.py | 38 +++++++++++++++++++++++++++++++++
 pandas/src/parser/io.c          |  1 +
 3 files changed, 41 insertions(+)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index ccdc48bc1dbbb..abca5d7dc033e 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -543,3 +543,5 @@ Bug Fixes
 - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`)
 
 - Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`)
+
+- Bug in ``buffer_rd_bytes`` where ``src->buffer`` could be freed more than once if reading failed, causing a segfault (:issue:`12098`)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 11ccb0eba8f72..0797354c1a92e 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3667,6 +3667,25 @@ def test_buffer_overflow(self):
                 self.assertIn(
                     'Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice, leading to a segfault, if a
+        # corrupt gzip file is read with read_csv and the buffer is filled
+        # more than once before gzip raises an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass
+
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
@@ -4208,6 +4227,25 @@ def test_buffer_overflow(self):
                 self.assertIn(
                     'Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice, leading to a segfault, if a
+        # corrupt gzip file is read with read_csv and the buffer is filled
+        # more than once before gzip raises an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass
+
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c
index 0297d1ba49527..566de72804968 100644
--- a/pandas/src/parser/io.c
+++ b/pandas/src/parser/io.c
@@ -121,6 +121,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes,
 
     /* delete old object */
     Py_XDECREF(src->buffer);
+    src->buffer = NULL;
     args = Py_BuildValue("(i)", nbytes);
 
     func = PyObject_GetAttrString(src->obj, "read");