@@ -69,6 +69,51 @@ def setUp(self):
6969 self .csv2 = os .path .join (self .dirpath , 'test2.csv' )
7070 self .xls1 = os .path .join (self .dirpath , 'test.xls' )
7171
72+ def construct_dataframe (self , num_rows ):
73+
74+ df = DataFrame (np .random .rand (num_rows , 5 ), columns = list ('abcde' ))
75+ df ['foo' ] = 'foo'
76+ df ['bar' ] = 'bar'
77+ df ['baz' ] = 'baz'
78+ df ['date' ] = pd .date_range ('20000101 09:00:00' ,
79+ periods = num_rows ,
80+ freq = 's' )
81+ df ['int' ] = np .arange (num_rows , dtype = 'int64' )
82+ return df
83+
84+ def generate_multithread_dataframe (self , path , num_rows , num_tasks ):
85+
86+ def reader (arg ):
87+ start , nrows = arg
88+
89+ if not start :
90+ return pd .read_csv (path , index_col = 0 , header = 0 , nrows = nrows ,
91+ parse_dates = ['date' ])
92+
93+ return pd .read_csv (path ,
94+ index_col = 0 ,
95+ header = None ,
96+ skiprows = int (start ) + 1 ,
97+ nrows = nrows ,
98+ parse_dates = [9 ])
99+
100+ tasks = [
101+ (num_rows * i / num_tasks ,
102+ num_rows / num_tasks ) for i in range (num_tasks )
103+ ]
104+
105+ pool = ThreadPool (processes = num_tasks )
106+
107+ results = pool .map (reader , tasks )
108+
109+ header = results [0 ].columns
110+ for r in results [1 :]:
111+ r .columns = header
112+
113+ final_dataframe = pd .concat (results )
114+
115+ return final_dataframe
116+
72117 def test_converters_type_must_be_dict (self ):
73118 with tm .assertRaisesRegexp (TypeError , 'Type converters.+' ):
74119 self .read_csv (StringIO (self .data1 ), converters = 0 )
@@ -3361,8 +3406,43 @@ def test_variable_width_unicode(self):
33613406 tm .assert_frame_equal (expected , read_fwf (BytesIO (test .encode ('utf8' )),
33623407 header = None , encoding = 'utf8' ))
33633408
3409+ class CParserTests (ParserTests ):
3410+ """ base class for CParser Testing """
3411+
3412+ def test_buffer_overflow (self ):
3413+ # GH9205
3414+ # test certain malformed input files that cause buffer overflows in
3415+ # tokenizer.c
3416+ malfw = "1\r 1\r 1\r 1\r 1\r " # buffer overflow in words pointer
3417+ malfs = "1\r 1\r 1\r 1\r 1\r 11\r " # buffer overflow in stream pointer
3418+ malfl = "1\r 1\r 1\r 1\r 1\r 11\r 1\r " # buffer overflow in lines pointer
3419+ for malf in (malfw , malfs , malfl ):
3420+ try :
3421+ df = self .read_table (StringIO (malf ))
3422+ except Exception as cperr :
3423+ self .assertIn (
3424+ 'Buffer overflow caught - possible malformed input file.' , str (cperr ))
3425+
3426+ def test_buffer_rd_bytes (self ):
3427+ # GH 12098
3428+ # src->buffer can be freed twice leading to a segfault if a corrupt
3429+ # gzip file is read with read_csv and the buffer is filled more than
3430+ # once before gzip throws an exception
3431+
3432+ data = '\x1F \x8B \x08 \x00 \x00 \x00 \x00 \x00 \x00 \x03 \xED \xC3 \x41 \x09 ' \
3433+ '\x00 \x00 \x08 \x00 \xB1 \xB7 \xB6 \xBA \xFE \xA5 \xCC \x21 \x6C \xB0 ' \
3434+ '\xA6 \x4D ' + '\x55 ' * 267 + \
3435+ '\x7D \xF7 \x00 \x91 \xE0 \x47 \x97 \x14 \x38 \x04 \x00 ' \
3436+ '\x1f \x8b \x08 \x00 VT\x97 V\x00 \x03 \xed ]\xef O'
3437+ for i in range (100 ):
3438+ try :
3439+ _ = self .read_csv (StringIO (data ),
3440+ compression = 'gzip' ,
3441+ delim_whitespace = True )
3442+ except Exception as e :
3443+ pass
33643444
3365- class TestCParserHighMemory (ParserTests , tm .TestCase ):
3445+ class TestCParserHighMemory (CParserTests , tm .TestCase ):
33663446
33673447 def read_csv (self , * args , ** kwds ):
33683448 kwds = kwds .copy ()
@@ -3653,39 +3733,6 @@ def test_fallback_to_python(self):
36533733 with tm .assertRaisesRegexp (ValueError , 'does not support' ):
36543734 self .read_table (StringIO (data ), engine = 'c' , skip_footer = 1 )
36553735
3656- def test_buffer_overflow (self ):
3657- # GH9205
3658- # test certain malformed input files that cause buffer overflows in
3659- # tokenizer.c
3660- malfw = "1\r 1\r 1\r 1\r 1\r " # buffer overflow in words pointer
3661- malfs = "1\r 1\r 1\r 1\r 1\r 11\r " # buffer overflow in stream pointer
3662- malfl = "1\r 1\r 1\r 1\r 1\r 11\r 1\r " # buffer overflow in lines pointer
3663- for malf in (malfw , malfs , malfl ):
3664- try :
3665- df = self .read_table (StringIO (malf ))
3666- except Exception as cperr :
3667- self .assertIn (
3668- 'Buffer overflow caught - possible malformed input file.' , str (cperr ))
3669-
3670- def test_buffer_rd_bytes (self ):
3671- # GH 12098
3672- # src->buffer can be freed twice leading to a segfault if a corrupt
3673- # gzip file is read with read_csv and the buffer is filled more than
3674- # once before gzip throws an exception
3675-
3676- data = '\x1F \x8B \x08 \x00 \x00 \x00 \x00 \x00 \x00 \x03 \xED \xC3 \x41 \x09 ' \
3677- '\x00 \x00 \x08 \x00 \xB1 \xB7 \xB6 \xBA \xFE \xA5 \xCC \x21 \x6C \xB0 ' \
3678- '\xA6 \x4D ' + '\x55 ' * 267 + \
3679- '\x7D \xF7 \x00 \x91 \xE0 \x47 \x97 \x14 \x38 \x04 \x00 ' \
3680- '\x1f \x8b \x08 \x00 VT\x97 V\x00 \x03 \xed ]\xef O'
3681- for i in range (100 ):
3682- try :
3683- _ = self .read_csv (StringIO (data ),
3684- compression = 'gzip' ,
3685- delim_whitespace = True )
3686- except Exception as e :
3687- pass
3688-
36893736 def test_single_char_leading_whitespace (self ):
36903737 # GH 9710
36913738 data = """\
@@ -3706,7 +3753,7 @@ def test_single_char_leading_whitespace(self):
37063753 tm .assert_frame_equal (result , expected )
37073754
37083755
3709- class TestCParserLowMemory (ParserTests , tm .TestCase ):
3756+ class TestCParserLowMemory (CParserTests , tm .TestCase ):
37103757
37113758 def read_csv (self , * args , ** kwds ):
37123759 kwds = kwds .copy ()
@@ -4213,39 +4260,6 @@ def test_raise_on_sep_with_delim_whitespace(self):
42134260 with tm .assertRaisesRegexp (ValueError , 'you can only specify one' ):
42144261 self .read_table (StringIO (data ), sep = '\s' , delim_whitespace = True )
42154262
4216- def test_buffer_overflow (self ):
4217- # GH9205
4218- # test certain malformed input files that cause buffer overflows in
4219- # tokenizer.c
4220- malfw = "1\r 1\r 1\r 1\r 1\r " # buffer overflow in words pointer
4221- malfs = "1\r 1\r 1\r 1\r 1\r 11\r " # buffer overflow in stream pointer
4222- malfl = "1\r 1\r 1\r 1\r 1\r 11\r 1\r " # buffer overflow in lines pointer
4223- for malf in (malfw , malfs , malfl ):
4224- try :
4225- df = self .read_table (StringIO (malf ))
4226- except Exception as cperr :
4227- self .assertIn (
4228- 'Buffer overflow caught - possible malformed input file.' , str (cperr ))
4229-
4230- def test_buffer_rd_bytes (self ):
4231- # GH 12098
4232- # src->buffer can be freed twice leading to a segfault if a corrupt
4233- # gzip file is read with read_csv and the buffer is filled more than
4234- # once before gzip throws an exception
4235-
4236- data = '\x1F \x8B \x08 \x00 \x00 \x00 \x00 \x00 \x00 \x03 \xED \xC3 \x41 \x09 ' \
4237- '\x00 \x00 \x08 \x00 \xB1 \xB7 \xB6 \xBA \xFE \xA5 \xCC \x21 \x6C \xB0 ' \
4238- '\xA6 \x4D ' + '\x55 ' * 267 + \
4239- '\x7D \xF7 \x00 \x91 \xE0 \x47 \x97 \x14 \x38 \x04 \x00 ' \
4240- '\x1f \x8b \x08 \x00 VT\x97 V\x00 \x03 \xed ]\xef O'
4241- for i in range (100 ):
4242- try :
4243- _ = self .read_csv (StringIO (data ),
4244- compression = 'gzip' ,
4245- delim_whitespace = True )
4246- except Exception as e :
4247- pass
4248-
42494263 def test_single_char_leading_whitespace (self ):
42504264 # GH 9710
42514265 data = """\
@@ -4300,51 +4314,6 @@ def test_multithread_stringio_read_csv(self):
43004314 for result in results :
43014315 tm .assert_frame_equal (first_result , result )
43024316
4303- def construct_dataframe (self , num_rows ):
4304-
4305- df = DataFrame (np .random .rand (num_rows , 5 ), columns = list ('abcde' ))
4306- df ['foo' ] = 'foo'
4307- df ['bar' ] = 'bar'
4308- df ['baz' ] = 'baz'
4309- df ['date' ] = pd .date_range ('20000101 09:00:00' ,
4310- periods = num_rows ,
4311- freq = 's' )
4312- df ['int' ] = np .arange (num_rows , dtype = 'int64' )
4313- return df
4314-
4315- def generate_multithread_dataframe (self , path , num_rows , num_tasks ):
4316-
4317- def reader (arg ):
4318- start , nrows = arg
4319-
4320- if not start :
4321- return pd .read_csv (path , index_col = 0 , header = 0 , nrows = nrows ,
4322- parse_dates = ['date' ])
4323-
4324- return pd .read_csv (path ,
4325- index_col = 0 ,
4326- header = None ,
4327- skiprows = int (start ) + 1 ,
4328- nrows = nrows ,
4329- parse_dates = [9 ])
4330-
4331- tasks = [
4332- (num_rows * i / num_tasks ,
4333- num_rows / num_tasks ) for i in range (num_tasks )
4334- ]
4335-
4336- pool = ThreadPool (processes = num_tasks )
4337-
4338- results = pool .map (reader , tasks )
4339-
4340- header = results [0 ].columns
4341- for r in results [1 :]:
4342- r .columns = header
4343-
4344- final_dataframe = pd .concat (results )
4345-
4346- return final_dataframe
4347-
43484317 def test_multithread_path_multipart_read_csv (self ):
43494318 # GH 11786
43504319 num_tasks = 4
0 commit comments