@@ -45,6 +45,11 @@ def test_get_dummies_raises_on_dtype_object(self, df):
45
45
with pytest .raises (ValueError , match = msg ):
46
46
get_dummies (df , dtype = "object" )
47
47
48
+ def test_get_dummies_warns_default_dtype (self , df ):
49
+ msg = "The default dtype will change from 'uint8' to 'bool'"
50
+ with pytest .warns (FutureWarning , match = msg ):
51
+ get_dummies (df )
52
+
48
53
def test_get_dummies_basic (self , sparse , dtype ):
49
54
s_list = list ("abc" )
50
55
s_series = Series (s_list )
@@ -121,9 +126,11 @@ def test_get_dummies_just_na(self, sparse):
121
126
just_na_series = Series (just_na_list )
122
127
just_na_series_index = Series (just_na_list , index = ["A" ])
123
128
124
- res_list = get_dummies (just_na_list , sparse = sparse )
125
- res_series = get_dummies (just_na_series , sparse = sparse )
126
- res_series_index = get_dummies (just_na_series_index , sparse = sparse )
129
+ res_list = get_dummies (just_na_list , dtype = np .uint8 , sparse = sparse )
130
+ res_series = get_dummies (just_na_series , dtype = np .uint8 , sparse = sparse )
131
+ res_series_index = get_dummies (
132
+ just_na_series_index , dtype = np .uint8 , sparse = sparse
133
+ )
127
134
128
135
assert res_list .empty
129
136
assert res_series .empty
@@ -169,7 +176,7 @@ def test_get_dummies_unicode(self, sparse):
169
176
e = "e"
170
177
eacute = unicodedata .lookup ("LATIN SMALL LETTER E WITH ACUTE" )
171
178
s = [e , eacute , eacute ]
172
- res = get_dummies (s , prefix = "letter" , sparse = sparse )
179
+ res = get_dummies (s , dtype = np . uint8 , prefix = "letter" , sparse = sparse )
173
180
exp = DataFrame (
174
181
{"letter_e" : [1 , 0 , 0 ], f"letter_{ eacute } " : [0 , 1 , 1 ]}, dtype = np .uint8
175
182
)
@@ -179,7 +186,7 @@ def test_get_dummies_unicode(self, sparse):
179
186
180
187
def test_dataframe_dummies_all_obj (self , df , sparse ):
181
188
df = df [["A" , "B" ]]
182
- result = get_dummies (df , sparse = sparse )
189
+ result = get_dummies (df , dtype = np . uint8 , sparse = sparse )
183
190
expected = DataFrame (
184
191
{"A_a" : [1 , 0 , 1 ], "A_b" : [0 , 1 , 0 ], "B_b" : [1 , 1 , 0 ], "B_c" : [0 , 0 , 1 ]},
185
192
dtype = np .uint8 ,
@@ -200,7 +207,7 @@ def test_dataframe_dummies_string_dtype(self, df):
200
207
# GH44965
201
208
df = df [["A" , "B" ]]
202
209
df = df .astype ({"A" : "object" , "B" : "string" })
203
- result = get_dummies (df )
210
+ result = get_dummies (df , dtype = np . uint8 )
204
211
expected = DataFrame (
205
212
{
206
213
"A_a" : [1 , 0 , 1 ],
@@ -234,7 +241,7 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
234
241
235
242
def test_dataframe_dummies_prefix_list (self , df , sparse ):
236
243
prefixes = ["from_A" , "from_B" ]
237
- result = get_dummies (df , prefix = prefixes , sparse = sparse )
244
+ result = get_dummies (df , dtype = np . uint8 , prefix = prefixes , sparse = sparse )
238
245
expected = DataFrame (
239
246
{
240
247
"C" : [1 , 2 , 3 ],
@@ -255,7 +262,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
255
262
256
263
def test_dataframe_dummies_prefix_str (self , df , sparse ):
257
264
# not that you should do this...
258
- result = get_dummies (df , prefix = "bad" , sparse = sparse )
265
+ result = get_dummies (df , dtype = np . uint8 , prefix = "bad" , sparse = sparse )
259
266
bad_columns = ["bad_a" , "bad_b" , "bad_b" , "bad_c" ]
260
267
expected = DataFrame (
261
268
[[1 , 1 , 0 , 1 , 0 ], [2 , 0 , 1 , 1 , 0 ], [3 , 1 , 0 , 0 , 1 ]],
@@ -280,7 +287,9 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
280
287
tm .assert_frame_equal (result , expected )
281
288
282
289
def test_dataframe_dummies_subset (self , df , sparse ):
283
- result = get_dummies (df , prefix = ["from_A" ], columns = ["A" ], sparse = sparse )
290
+ result = get_dummies (
291
+ df , dtype = np .uint8 , prefix = ["from_A" ], columns = ["A" ], sparse = sparse
292
+ )
284
293
expected = DataFrame (
285
294
{
286
295
"B" : ["b" , "b" , "c" ],
@@ -298,7 +307,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
298
307
tm .assert_frame_equal (result , expected )
299
308
300
309
def test_dataframe_dummies_prefix_sep (self , df , sparse ):
301
- result = get_dummies (df , prefix_sep = ".." , sparse = sparse )
310
+ result = get_dummies (df , dtype = np . uint8 , prefix_sep = ".." , sparse = sparse )
302
311
expected = DataFrame (
303
312
{
304
313
"C" : [1 , 2 , 3 ],
@@ -317,11 +326,13 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
317
326
318
327
tm .assert_frame_equal (result , expected )
319
328
320
- result = get_dummies (df , prefix_sep = [".." , "__" ], sparse = sparse )
329
+ result = get_dummies (df , dtype = np . uint8 , prefix_sep = [".." , "__" ], sparse = sparse )
321
330
expected = expected .rename (columns = {"B..b" : "B__b" , "B..c" : "B__c" })
322
331
tm .assert_frame_equal (result , expected )
323
332
324
- result = get_dummies (df , prefix_sep = {"A" : ".." , "B" : "__" }, sparse = sparse )
333
+ result = get_dummies (
334
+ df , dtype = np .uint8 , prefix_sep = {"A" : ".." , "B" : "__" }, sparse = sparse
335
+ )
325
336
tm .assert_frame_equal (result , expected )
326
337
327
338
def test_dataframe_dummies_prefix_bad_length (self , df , sparse ):
@@ -330,20 +341,20 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
330
341
"encoded (2)"
331
342
)
332
343
with pytest .raises (ValueError , match = msg ):
333
- get_dummies (df , prefix = ["too few" ], sparse = sparse )
344
+ get_dummies (df , dtype = np . uint8 , prefix = ["too few" ], sparse = sparse )
334
345
335
346
def test_dataframe_dummies_prefix_sep_bad_length (self , df , sparse ):
336
347
msg = re .escape (
337
348
"Length of 'prefix_sep' (1) did not match the length of the columns being "
338
349
"encoded (2)"
339
350
)
340
351
with pytest .raises (ValueError , match = msg ):
341
- get_dummies (df , prefix_sep = ["bad" ], sparse = sparse )
352
+ get_dummies (df , dtype = np . uint8 , prefix_sep = ["bad" ], sparse = sparse )
342
353
343
354
def test_dataframe_dummies_prefix_dict (self , sparse ):
344
355
prefixes = {"A" : "from_A" , "B" : "from_B" }
345
356
df = DataFrame ({"C" : [1 , 2 , 3 ], "A" : ["a" , "b" , "a" ], "B" : ["b" , "b" , "c" ]})
346
- result = get_dummies (df , prefix = prefixes , sparse = sparse )
357
+ result = get_dummies (df , dtype = np . uint8 , prefix = prefixes , sparse = sparse )
347
358
348
359
expected = DataFrame (
349
360
{
@@ -453,16 +464,18 @@ def test_get_dummies_basic_drop_first(self, sparse):
453
464
454
465
expected = DataFrame ({"b" : [0 , 1 , 0 ], "c" : [0 , 0 , 1 ]}, dtype = np .uint8 )
455
466
456
- result = get_dummies (s_list , drop_first = True , sparse = sparse )
467
+ result = get_dummies (s_list , dtype = np . uint8 , drop_first = True , sparse = sparse )
457
468
if sparse :
458
469
expected = expected .apply (SparseArray , fill_value = 0 )
459
470
tm .assert_frame_equal (result , expected )
460
471
461
- result = get_dummies (s_series , drop_first = True , sparse = sparse )
472
+ result = get_dummies (s_series , dtype = np . uint8 , drop_first = True , sparse = sparse )
462
473
tm .assert_frame_equal (result , expected )
463
474
464
475
expected .index = list ("ABC" )
465
- result = get_dummies (s_series_index , drop_first = True , sparse = sparse )
476
+ result = get_dummies (
477
+ s_series_index , dtype = np .uint8 , drop_first = True , sparse = sparse
478
+ )
466
479
tm .assert_frame_equal (result , expected )
467
480
468
481
def test_get_dummies_basic_drop_first_one_level (self , sparse ):
@@ -473,27 +486,31 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse):
473
486
474
487
expected = DataFrame (index = np .arange (3 ))
475
488
476
- result = get_dummies (s_list , drop_first = True , sparse = sparse )
489
+ result = get_dummies (s_list , dtype = np . uint8 , drop_first = True , sparse = sparse )
477
490
tm .assert_frame_equal (result , expected )
478
491
479
- result = get_dummies (s_series , drop_first = True , sparse = sparse )
492
+ result = get_dummies (s_series , dtype = np . uint8 , drop_first = True , sparse = sparse )
480
493
tm .assert_frame_equal (result , expected )
481
494
482
495
expected = DataFrame (index = list ("ABC" ))
483
- result = get_dummies (s_series_index , drop_first = True , sparse = sparse )
496
+ result = get_dummies (
497
+ s_series_index , dtype = np .uint8 , drop_first = True , sparse = sparse
498
+ )
484
499
tm .assert_frame_equal (result , expected )
485
500
486
501
def test_get_dummies_basic_drop_first_NA (self , sparse ):
487
502
# Test NA handling together with drop_first
488
503
s_NA = ["a" , "b" , np .nan ]
489
- res = get_dummies (s_NA , drop_first = True , sparse = sparse )
504
+ res = get_dummies (s_NA , dtype = np . uint8 , drop_first = True , sparse = sparse )
490
505
exp = DataFrame ({"b" : [0 , 1 , 0 ]}, dtype = np .uint8 )
491
506
if sparse :
492
507
exp = exp .apply (SparseArray , fill_value = 0 )
493
508
494
509
tm .assert_frame_equal (res , exp )
495
510
496
- res_na = get_dummies (s_NA , dummy_na = True , drop_first = True , sparse = sparse )
511
+ res_na = get_dummies (
512
+ s_NA , dtype = np .uint8 , dummy_na = True , drop_first = True , sparse = sparse
513
+ )
497
514
exp_na = DataFrame ({"b" : [0 , 1 , 0 ], np .nan : [0 , 0 , 1 ]}, dtype = np .uint8 ).reindex (
498
515
["b" , np .nan ], axis = 1
499
516
)
@@ -502,22 +519,22 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
502
519
tm .assert_frame_equal (res_na , exp_na )
503
520
504
521
res_just_na = get_dummies (
505
- [np .nan ], dummy_na = True , drop_first = True , sparse = sparse
522
+ [np .nan ], dtype = np . uint8 , dummy_na = True , drop_first = True , sparse = sparse
506
523
)
507
524
exp_just_na = DataFrame (index = np .arange (1 ))
508
525
tm .assert_frame_equal (res_just_na , exp_just_na )
509
526
510
527
def test_dataframe_dummies_drop_first (self , df , sparse ):
511
528
df = df [["A" , "B" ]]
512
- result = get_dummies (df , drop_first = True , sparse = sparse )
529
+ result = get_dummies (df , dtype = np . uint8 , drop_first = True , sparse = sparse )
513
530
expected = DataFrame ({"A_b" : [0 , 1 , 0 ], "B_c" : [0 , 0 , 1 ]}, dtype = np .uint8 )
514
531
if sparse :
515
532
expected = expected .apply (SparseArray , fill_value = 0 )
516
533
tm .assert_frame_equal (result , expected )
517
534
518
535
def test_dataframe_dummies_drop_first_with_categorical (self , df , sparse , dtype ):
519
536
df ["cat" ] = Categorical (["x" , "y" , "y" ])
520
- result = get_dummies (df , drop_first = True , sparse = sparse )
537
+ result = get_dummies (df , dtype = np . uint8 , drop_first = True , sparse = sparse )
521
538
expected = DataFrame (
522
539
{"C" : [1 , 2 , 3 ], "A_b" : [0 , 1 , 0 ], "B_c" : [0 , 0 , 1 ], "cat_y" : [0 , 1 , 1 ]}
523
540
)
@@ -532,7 +549,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
532
549
def test_dataframe_dummies_drop_first_with_na (self , df , sparse ):
533
550
df .loc [3 , :] = [np .nan , np .nan , np .nan ]
534
551
result = get_dummies (
535
- df , dummy_na = True , drop_first = True , sparse = sparse
552
+ df , dtype = np . uint8 , dummy_na = True , drop_first = True , sparse = sparse
536
553
).sort_index (axis = 1 )
537
554
expected = DataFrame (
538
555
{
@@ -552,18 +569,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
552
569
553
570
tm .assert_frame_equal (result , expected )
554
571
555
- result = get_dummies (df , dummy_na = False , drop_first = True , sparse = sparse )
572
+ result = get_dummies (
573
+ df , dtype = np .uint8 , dummy_na = False , drop_first = True , sparse = sparse
574
+ )
556
575
expected = expected [["C" , "A_b" , "B_c" ]]
557
576
tm .assert_frame_equal (result , expected )
558
577
559
578
def test_get_dummies_int_int (self ):
560
579
data = Series ([1 , 2 , 1 ])
561
- result = get_dummies (data )
580
+ result = get_dummies (data , dtype = np . uint8 )
562
581
expected = DataFrame ([[1 , 0 ], [0 , 1 ], [1 , 0 ]], columns = [1 , 2 ], dtype = np .uint8 )
563
582
tm .assert_frame_equal (result , expected )
564
583
565
584
data = Series (Categorical (["a" , "b" , "a" ]))
566
- result = get_dummies (data )
585
+ result = get_dummies (data , dtype = np . uint8 )
567
586
expected = DataFrame (
568
587
[[1 , 0 ], [0 , 1 ], [1 , 0 ]], columns = Categorical (["a" , "b" ]), dtype = np .uint8
569
588
)
@@ -605,15 +624,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
605
624
def test_get_dummies_dont_sparsify_all_columns (self , sparse ):
606
625
# GH18914
607
626
df = DataFrame .from_dict ({"GDP" : [1 , 2 ], "Nation" : ["AB" , "CD" ]})
608
- df = get_dummies (df , columns = ["Nation" ], sparse = sparse )
627
+ df = get_dummies (df , dtype = np . uint8 , columns = ["Nation" ], sparse = sparse )
609
628
df2 = df .reindex (columns = ["GDP" ])
610
629
611
630
tm .assert_frame_equal (df [["GDP" ]], df2 )
612
631
613
632
def test_get_dummies_duplicate_columns (self , df ):
614
633
# GH20839
615
634
df .columns = ["A" , "A" , "A" ]
616
- result = get_dummies (df ).sort_index (axis = 1 )
635
+ result = get_dummies (df , dtype = np . uint8 ).sort_index (axis = 1 )
617
636
618
637
expected = DataFrame (
619
638
[[1 , 1 , 0 , 1 , 0 ], [2 , 0 , 1 , 1 , 0 ], [3 , 1 , 0 , 0 , 1 ]],
@@ -627,7 +646,7 @@ def test_get_dummies_duplicate_columns(self, df):
627
646
628
647
def test_get_dummies_all_sparse (self ):
629
648
df = DataFrame ({"A" : [1 , 2 ]})
630
- result = get_dummies (df , columns = ["A" ], sparse = True )
649
+ result = get_dummies (df , dtype = np . uint8 , columns = ["A" ], sparse = True )
631
650
dtype = SparseDtype ("uint8" , 0 )
632
651
expected = DataFrame (
633
652
{
@@ -652,4 +671,4 @@ def test_get_dummies_with_string_values(self, values):
652
671
msg = "Input must be a list-like for parameter `columns`"
653
672
654
673
with pytest .raises (TypeError , match = msg ):
655
- get_dummies (df , columns = values )
674
+ get_dummies (df , dtype = np . uint8 , columns = values )
0 commit comments