Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ I/O
^^^

- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.

- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)

Plotting
^^^^^^^^
Expand Down
30 changes: 19 additions & 11 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,17 @@ cdef class TextReader:
float_precision=None,
skip_blank_lines=True):

# encoding
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that you copied and pasted this, but let's take this opportunity to provide a much more informative comment about this whole block of logic (a sentence is sufficient).

if encoding is not None:
if not isinstance(encoding, bytes):
encoding = encoding.encode('utf-8')
encoding = encoding.lower()
self.c_encoding = <char*> encoding
else:
self.c_encoding = NULL

self.encoding = encoding

self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize

Expand Down Expand Up @@ -495,17 +506,6 @@ cdef class TextReader:
self.parser.double_converter_nogil = NULL
self.parser.double_converter_withgil = round_trip

# encoding
if encoding is not None:
if not isinstance(encoding, bytes):
encoding = encoding.encode('utf-8')
encoding = encoding.lower()
self.c_encoding = <char*> encoding
else:
self.c_encoding = NULL

self.encoding = encoding

if isinstance(dtype, dict):
dtype = {k: pandas_dtype(dtype[k])
for k in dtype}
Expand Down Expand Up @@ -684,6 +684,14 @@ cdef class TextReader:
else:
raise ValueError('Unrecognized compression type: %s' %
self.compression)

if b'utf-16' in (self.encoding or b''):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment here on what is going on

# The source is UTF-16 encoded: wrap it in UTF8Recoder so that it is
# re-encoded to UTF-8, and treat the stream as UTF-8 from here on.
source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a short explanation of why we are doing this?

self.encoding = b'utf-8'
self.c_encoding = <char*> self.encoding

self.handle = source

if isinstance(source, basestring):
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,9 @@ def __init__(self, src, **kwds):

ParserBase.__init__(self, kwds)

if 'utf-16' in (kwds.get('encoding') or ''):
if (kwds.get('compression') is None
and 'utf-16' in (kwds.get('encoding') or '')):
# if source is utf-16 plain text, convert source to utf-8
if isinstance(src, compat.string_types):
src = open(src, 'rb')
self.handles.append(src)
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/io/parser/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pytest

import pandas as pd
import pandas.util.testing as tm


Expand Down Expand Up @@ -157,6 +158,19 @@ def test_read_csv_infer_compression(self):

inputs[3].close()

def test_read_csv_compressed_utf16_example(self):
    """Check that a zip-compressed, UTF-16 encoded file parses correctly.

    The fixture is read as tab-separated data with ``encoding='utf-16'``
    and ``compression='zip'``; the resulting frame must match the two
    expected rows exactly.
    """
    # GH18071
    zip_path = tm.get_data_path('utf16_ex_small.zip')
    parsed = self.read_csv(zip_path, encoding='utf-16',
                           compression='zip', sep='\t')

    countries = [u'Venezuela', u'Venezuela']
    handles = [u'Hugo Chávez Frías', u'Henrique Capriles R.']
    wanted = pd.DataFrame({
        u'Country': countries,
        u'Twitter': handles,
    })

    tm.assert_frame_equal(parsed, wanted)

def test_invalid_compression(self):
msg = 'Unrecognized compression type: sfark'
with tm.assert_raises_regex(ValueError, msg):
Expand Down
Binary file added pandas/tests/io/parser/data/utf16_ex_small.zip
Binary file not shown.