@@ -317,14 +317,23 @@ def test_featurevalidator_get_columns_to_encode():
317
317
assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
318
318
319
319
320
- def test_featurevalidator_remove_nan_catcolumns ():
320
+ def feature_validator_remove_nan_catcolumns (df_train : pd .DataFrame , df_test : pd .DataFrame ,
321
+ ans_train : np .ndarray , ans_test : np .ndarray ) -> None :
322
+ validator = TabularFeatureValidator ()
323
+ validator .fit (df_train )
324
+ transformed_df_train = validator .transform (df_train )
325
+ transformed_df_test = validator .transform (df_test )
326
+
327
+ assert np .array_equal (transformed_df_train , ans_train )
328
+ assert np .array_equal (transformed_df_test , ans_test )
329
+
330
+
331
+ def test_feature_validator_remove_nan_catcolumns ():
321
332
"""
322
333
Make sure categorical columns that have only nan values are removed.
323
334
"""
324
- # First case, there exist null columns in the train set
325
- # and the same columns are not all null for the test set.
326
- validator = TabularFeatureValidator ()
327
-
335
+ # First case, there exist null columns (B and C) in the train set
336
+ # and one of those columns (C) is not all null in the test set.
328
337
df_train = pd .DataFrame (
329
338
[
330
339
{'A' : 1 , 'B' : np .nan , 'C' : np .nan },
@@ -333,6 +342,7 @@ def test_featurevalidator_remove_nan_catcolumns():
333
342
],
334
343
dtype = 'category' ,
335
344
)
345
+ ans_train = np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
336
346
df_test = pd .DataFrame (
337
347
[
338
348
{'A' : np .nan , 'B' : np .nan , 'C' : 5 },
@@ -341,18 +351,11 @@ def test_featurevalidator_remove_nan_catcolumns():
341
351
],
342
352
dtype = 'category' ,
343
353
)
354
+ ans_test = np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
355
+ feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
344
356
345
- validator .fit (df_train )
346
- transformed_df_train = validator .transform (df_train )
347
- transformed_df_test = validator .transform (df_test )
348
-
349
- assert np .array_equal (transformed_df_train , np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = float ))
350
- assert np .array_equal (transformed_df_test , np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = float ))
351
-
352
- # Second case, there exist null columns in the training set and the same
353
- # are null in the test set.
354
- validator = TabularFeatureValidator ()
355
-
357
+ # Second case, there exist null columns (B and C) in the training set and
358
+ # the same columns (B and C) are null in the test set.
356
359
df_train = pd .DataFrame (
357
360
[
358
361
{'A' : 1 , 'B' : np .nan , 'C' : np .nan },
@@ -361,6 +364,7 @@ def test_featurevalidator_remove_nan_catcolumns():
361
364
],
362
365
dtype = 'category' ,
363
366
)
367
+ ans_train = np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
364
368
df_test = pd .DataFrame (
365
369
[
366
370
{'A' : np .nan , 'B' : np .nan , 'C' : np .nan },
@@ -369,40 +373,28 @@ def test_featurevalidator_remove_nan_catcolumns():
369
373
],
370
374
dtype = 'category' ,
371
375
)
376
+ ans_test = np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
377
+ feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
372
378
373
- validator .fit (df_train )
374
- transformed_df_train = validator .transform (df_train )
375
- transformed_df_test = validator .transform (df_test )
376
-
377
- assert np .array_equal (transformed_df_train , np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = float ))
378
- assert np .array_equal (transformed_df_test , np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = float ))
379
-
380
- # Third case, there exist no null columns in the training set and a
381
- # few null columns exist in the test set.
382
- validator = TabularFeatureValidator ()
383
-
379
+ # Third case, there exist no null columns in the training set and
380
+ # null columns exist in the test set.
384
381
df_train = pd .DataFrame (
385
382
[
386
383
{'A' : 1 , 'B' : 1 },
387
384
{'A' : 2 , 'B' : 2 }
388
385
],
389
386
dtype = 'category' ,
390
387
)
388
+ ans_train = np .array ([[1 , 0 , 1 , 0 ], [0 , 1 , 0 , 1 ]], dtype = np .float64 )
391
389
df_test = pd .DataFrame (
392
390
[
393
391
{'A' : np .nan , 'B' : np .nan },
394
392
{'A' : np .nan , 'B' : np .nan }
395
393
],
396
394
dtype = 'category' ,
397
395
)
398
-
399
- validator .fit (df_train )
400
- transformed_df_train = validator .transform (df_train )
401
- transformed_df_test = validator .transform (df_test )
402
-
403
- assert np .array_equal (transformed_df_train , np .array ([[1 , 0 , 1 , 0 ], [0 , 1 , 0 , 1 ]], dtype = float ))
404
- assert np .array_equal (transformed_df_test , np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = float ))
405
-
396
+ ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
397
+ feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
406
398
407
399
def test_features_unsupported_calls_are_raised ():
408
400
"""
0 commit comments