From fb204a232307d094ecb76789c0528be295361465 Mon Sep 17 00:00:00 2001 From: Scott E Lasley Date: Mon, 25 Jan 2016 12:06:11 -0500 Subject: [PATCH 1/3] BUG: parser buffer could be freed more than once if reading failed in buffer_rd_bytes, causing a segfault Closes #12098 --- doc/source/whatsnew/v0.18.0.txt | 2 ++ pandas/io/tests/test_parsers.py | 38 +++++++++++++++++++++++++++++++++ pandas/src/parser/io.c | 1 + 3 files changed, 41 insertions(+) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 81696982d0fde..48fb56213fa6a 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -545,3 +545,5 @@ of columns didn't match the number of series provided (:issue:`12039`). - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`) - Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`) + +- Bug in ``buffer_rd_bytes`` where ``src->buffer`` could be freed more than once if reading failed, causing a segfault (:issue:`12098`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 11ccb0eba8f72..0797354c1a92e 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3667,6 +3667,25 @@ def test_buffer_overflow(self): self.assertIn( 'Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_buffer_rd_bytes(self): + # GH 12098 + # src->buffer can be freed twice leading to a segfault if a corrupt + # gzip file is read with read_csv and the buffer is filled more than + # once before gzip throws an exception + + data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ + '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ + '\xA6\x4D' + '\x55' * 267 + \ + '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ + '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' + for i in range(100): + try: + _ = self.read_csv(StringIO(data), + compression='gzip', + delim_whitespace=True) + except 
Exception as e: + pass + def test_single_char_leading_whitespace(self): # GH 9710 data = """\ @@ -4208,6 +4227,25 @@ def test_buffer_overflow(self): self.assertIn( 'Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_buffer_rd_bytes(self): + # GH 12098 + # src->buffer can be freed twice leading to a segfault if a corrupt + # gzip file is read with read_csv and the buffer is filled more than + # once before gzip throws an exception + + data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ + '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ + '\xA6\x4D' + '\x55' * 267 + \ + '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ + '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' + for i in range(100): + try: + _ = self.read_csv(StringIO(data), + compression='gzip', + delim_whitespace=True) + except Exception as e: + pass + def test_single_char_leading_whitespace(self): # GH 9710 data = """\ diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 0297d1ba49527..566de72804968 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -121,6 +121,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes, /* delete old object */ Py_XDECREF(src->buffer); + src->buffer = NULL; args = Py_BuildValue("(i)", nbytes); func = PyObject_GetAttrString(src->obj, "read"); From d77f0724b21cb290a0daec0202f2119543644236 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 26 Jan 2016 15:32:48 -0500 Subject: [PATCH 2/3] DOC: whatsnew edits --- doc/source/whatsnew/v0.18.0.txt | 60 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 81696982d0fde..ccdc48bc1dbbb 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -201,10 +201,6 @@ In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available thru t s s.dt.round('D') -.. 
_whatsnew_0180.api: - -- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`) - .. _whatsnew_0180.api_breaking: Backwards incompatible API changes @@ -319,29 +315,6 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``. d = pd.Timestamp('2014-02-15') d + pd.offsets.QuarterBegin(n=0, startingMonth=2) - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) - - .. ipython:: python - - s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s.between_time("7:00am", "9:00am") - - This will now raise. - - .. code-block:: python - - In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00') - ValueError: Cannot convert arg ['20150101 07:00:00'] to a time. - -- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`) - -- ``DataFrame.to_latex()`` now supports non-ascii encodings (eg utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`) - - Changes to eval ^^^^^^^^^^^^^^^ @@ -397,6 +370,32 @@ assignments are valid for multi-line expressions. g = f / 2.0""", inplace=True) df + +.. _whatsnew_0180.api: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) + + .. ipython:: python + + s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) + s.between_time("7:00am", "9:00am") + + This will now raise. + + .. code-block:: python + + In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00') + ValueError: Cannot convert arg ['20150101 07:00:00'] to a time. 
+ +- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`) + +- ``DataFrame.to_latex()`` now supports non-ascii encodings (eg utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`) + +- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`) + .. _whatsnew_0180.deprecations: Deprecations @@ -502,7 +501,7 @@ Bug Fixes - Bug in ``pd.read_clipboard`` and ``pd.to_clipboard`` functions not supporting Unicode; upgrade included ``pyperclip`` to v1.5.15 (:issue:`9263`) - Bug in ``DataFrame.query`` containing an assignment (:issue:`8664`) -- Bug in ``from_msgpack`` where ``__contains__()`` fails for columns of the unpacked ``DataFrame``, if the ``DataFrame`` has object columns. (:issue: `11880`) +- Bug in ``from_msgpack`` where ``__contains__()`` fails for columns of the unpacked ``DataFrame``, if the ``DataFrame`` has object columns. 
(:issue:`11880`) - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`) @@ -521,7 +520,7 @@ Bug Fixes - Bug in ``Index`` prevents copying name of passed ``Index``, when a new name is not provided (:issue:`11193`) - Bug in ``read_excel`` failing to read any non-empty sheets when empty sheets exist and ``sheetname=None`` (:issue:`11711`) - Bug in ``read_excel`` failing to raise ``NotImplemented`` error when keywords ``parse_dates`` and ``date_parser`` are provided (:issue:`11544`) -- Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`) +- Bug in ``read_sql`` with ``pymysql`` connections failing to return chunked data (:issue:`11522`) - Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`) - Bug in ``Int64Index`` and ``Float64Index`` preventing the use of the modulo operator (:issue:`9244`) @@ -529,8 +528,7 @@ Bug Fixes - Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`) -- Bug in ``.plot`` potentially modifying the ``colors`` input when the number -of columns didn't match the number of series provided (:issue:`12039`). +- Bug in ``.plot`` potentially modifying the ``colors`` input when the number of columns didn't match the number of series provided (:issue:`12039`). 
- Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`) From a1f0a79e059a807fcbdde3714152385e651a21c0 Mon Sep 17 00:00:00 2001 From: Scott E Lasley Date: Mon, 25 Jan 2016 12:06:11 -0500 Subject: [PATCH 3/3] BUG: parser buffer could be freed more than once if reading failed in buffer_rd_bytes, causing a segfault Closes #12098 --- doc/source/whatsnew/v0.18.0.txt | 2 ++ pandas/io/tests/test_parsers.py | 38 +++++++++++++++++++++++++++++++++ pandas/src/parser/io.c | 1 + 3 files changed, 41 insertions(+) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index ccdc48bc1dbbb..abca5d7dc033e 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -543,3 +543,5 @@ Bug Fixes - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`) - Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`) + +- Bug in ``buffer_rd_bytes`` where ``src->buffer`` could be freed more than once if reading failed, causing a segfault (:issue:`12098`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 11ccb0eba8f72..0797354c1a92e 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3667,6 +3667,25 @@ def test_buffer_overflow(self): self.assertIn( 'Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_buffer_rd_bytes(self): + # GH 12098 + # src->buffer can be freed twice leading to a segfault if a corrupt + # gzip file is read with read_csv and the buffer is filled more than + # once before gzip throws an exception + + data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ + '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ + '\xA6\x4D' + '\x55' * 267 + \ + '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ + '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' + for i in range(100): + try: + _ = 
self.read_csv(StringIO(data), + compression='gzip', + delim_whitespace=True) + except Exception as e: + pass + def test_single_char_leading_whitespace(self): # GH 9710 data = """\ @@ -4208,6 +4227,25 @@ def test_buffer_overflow(self): self.assertIn( 'Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_buffer_rd_bytes(self): + # GH 12098 + # src->buffer can be freed twice leading to a segfault if a corrupt + # gzip file is read with read_csv and the buffer is filled more than + # once before gzip throws an exception + + data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ + '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ + '\xA6\x4D' + '\x55' * 267 + \ + '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ + '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' + for i in range(100): + try: + _ = self.read_csv(StringIO(data), + compression='gzip', + delim_whitespace=True) + except Exception as e: + pass + def test_single_char_leading_whitespace(self): # GH 9710 data = """\ diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 0297d1ba49527..566de72804968 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -121,6 +121,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes, /* delete old object */ Py_XDECREF(src->buffer); + src->buffer = NULL; args = Py_BuildValue("(i)", nbytes); func = PyObject_GetAttrString(src->obj, "read");