13
13
# limitations under the License.
14
14
15
15
import datetime as dt
16
+ import json
16
17
import math
17
18
import re
18
19
import tempfile
19
20
20
21
import db_dtypes # type: ignore
21
22
import geopandas as gpd # type: ignore
23
+ import google .api_core .exceptions
22
24
import numpy
23
25
from packaging .version import Version
24
26
import pandas as pd
@@ -3474,9 +3476,11 @@ def foo(x):
3474
3476
("int64_col" , pd .ArrowDtype (pa .timestamp ("us" ))),
3475
3477
("int64_col" , pd .ArrowDtype (pa .timestamp ("us" , tz = "UTC" ))),
3476
3478
("int64_col" , "time64[us][pyarrow]" ),
3479
+ ("int64_col" , pd .ArrowDtype (db_dtypes .JSONArrowType ())),
3477
3480
("bool_col" , "Int64" ),
3478
3481
("bool_col" , "string[pyarrow]" ),
3479
3482
("bool_col" , "Float64" ),
3483
+ ("bool_col" , pd .ArrowDtype (db_dtypes .JSONArrowType ())),
3480
3484
("string_col" , "binary[pyarrow]" ),
3481
3485
("bytes_col" , "string[pyarrow]" ),
3482
3486
# pandas actually doesn't let folks convert to/from naive timestamp and
@@ -3541,7 +3545,7 @@ def test_astype_safe(session):
3541
3545
pd .testing .assert_series_equal (result , exepcted )
3542
3546
3543
3547
3544
- def test_series_astype_error_error (session ):
3548
def test_series_astype_w_invalid_error(session):
    """Expect ``ValueError`` from ``astype`` called with ``errors="bad_value"``.

    Args:
        session: fixture object exposing ``read_pandas``.
    """
    # Named ``pd_series`` rather than ``input`` so the builtin is not shadowed.
    pd_series = pd.Series(["hello", "world", "3.11", "4000"])
    with pytest.raises(ValueError):
        session.read_pandas(pd_series).astype("Float64", errors="bad_value")
@@ -3676,6 +3680,118 @@ def test_timestamp_astype_string():
3676
3680
assert bf_result .dtype == "string[pyarrow]"
3677
3681
3678
3682
3683
@pytest.mark.parametrize("errors", ["raise", "null"])
def test_float_astype_json(errors):
    """Cast a FLOAT_DTYPE series to JSON_DTYPE and compare with the pandas cast.

    Args:
        errors: value forwarded to ``astype(errors=...)``; parametrized over
            "raise" and "null".
    """
    raw_values = ["1.25", "2500000000", None, "-12323.24"]
    float_series = series.Series(raw_values, dtype=dtypes.FLOAT_DTYPE)

    json_series = float_series.astype(dtypes.JSON_DTYPE, errors=errors)
    assert json_series.dtype == dtypes.JSON_DTYPE

    # Reference value: convert the float series to pandas first, then cast.
    expected = float_series.to_pandas().astype(dtypes.JSON_DTYPE)
    actual = json_series.to_pandas()
    pd.testing.assert_series_equal(actual, expected)
3693
+
3694
+
3695
@pytest.mark.parametrize("errors", ["raise", "null"])
def test_string_astype_json(errors):
    """Cast a STRING_DTYPE series to JSON_DTYPE and compare with the pandas cast.

    The inputs cover a scalar, a null, an array and a nested object.

    Args:
        errors: value forwarded to ``astype(errors=...)``; parametrized over
            "raise" and "null".
    """
    json_texts = [
        "1",
        None,
        '["1","3","5"]',
        '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}',
    ]
    string_series = series.Series(json_texts, dtype=dtypes.STRING_DTYPE)

    json_series = string_series.astype(dtypes.JSON_DTYPE, errors=errors)
    assert json_series.dtype == dtypes.JSON_DTYPE

    # Reference value: convert the string series to pandas first, then cast.
    expected = string_series.to_pandas().astype(dtypes.JSON_DTYPE)
    actual = json_series.to_pandas()
    pd.testing.assert_series_equal(actual, expected)
3710
+
3711
+
3712
def test_string_astype_json_in_safe_mode():
    """With ``errors="null"``, a non-JSON string is expected to become null."""
    string_series = series.Series(
        ["this is not a valid json string"], dtype=dtypes.STRING_DTYPE
    )
    json_series = string_series.astype(dtypes.JSON_DTYPE, errors="null")
    assert json_series.dtype == dtypes.JSON_DTYPE

    all_null = pd.Series([None], dtype=dtypes.JSON_DTYPE)
    # The expected index is cast to "Int64" before the comparison.
    all_null.index = all_null.index.astype("Int64")
    pd.testing.assert_series_equal(json_series.to_pandas(), all_null)
3721
+
3722
+
3723
def test_string_astype_json_raise_error():
    """With ``errors="raise"``, a non-JSON string is expected to raise BadRequest."""
    string_series = series.Series(
        ["this is not a valid json string"], dtype=dtypes.STRING_DTYPE
    )
    expected_error = google.api_core.exceptions.BadRequest
    with pytest.raises(expected_error, match="syntax error while parsing value"):
        # Both the cast and the conversion to pandas stay inside the context.
        json_series = string_series.astype(dtypes.JSON_DTYPE, errors="raise")
        json_series.to_pandas()
3731
+
3732
+
3733
@pytest.mark.parametrize("errors", ["raise", "null"])
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"),
        pytest.param(
            ['"str"', None],
            dtypes.TIME_DTYPE,
            id="invalid",
            marks=pytest.mark.xfail(raises=TypeError),
        ),
    ],
)
def test_json_astype_others(data, to_type, errors):
    """Cast a JSON_DTYPE series to another dtype and compare with decoded values.

    Args:
        data: JSON-encoded strings (or ``None``) used to build the series.
        to_type: target dtype passed to ``astype``.
        errors: value forwarded to ``astype(errors=...)``.
    """
    json_series = series.Series(data, dtype=dtypes.JSON_DTYPE)

    converted = json_series.astype(to_type, errors=errors)
    assert converted.dtype == to_type

    # Decode each JSON text locally; nulls pass through unchanged.
    decoded = [None if text is None else json.loads(text) for text in data]
    expected = pd.Series(decoded, dtype=to_type)
    expected.index = expected.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), expected)
3759
+
3760
+
3761
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
    ],
)
def test_json_astype_others_raise_error(data, to_type):
    """With ``errors="raise"``, each listed JSON-to-dtype cast should raise BadRequest.

    Args:
        data: JSON-encoded strings (or ``None``) used to build the series.
        to_type: target dtype passed to ``astype``.
    """
    json_series = series.Series(data, dtype=dtypes.JSON_DTYPE)
    expected_error = google.api_core.exceptions.BadRequest
    with pytest.raises(expected_error):
        # Both the cast and the conversion to pandas stay inside the context.
        converted = json_series.astype(to_type, errors="raise")
        converted.to_pandas()
3774
+
3775
+
3776
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
    ],
)
def test_json_astype_others_in_safe_mode(data, to_type):
    """With ``errors="null"``, each listed JSON-to-dtype cast should yield all nulls.

    Args:
        data: JSON-encoded strings (or ``None``) used to build the series.
        to_type: target dtype passed to ``astype``.
    """
    json_series = series.Series(data, dtype=dtypes.JSON_DTYPE)
    converted = json_series.astype(to_type, errors="null")
    assert converted.dtype == to_type

    all_null = pd.Series([None, None], dtype=to_type)
    # The expected index is cast to "Int64" before the comparison.
    all_null.index = all_null.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), all_null)
3793
+
3794
+
3679
3795
@pytest .mark .parametrize (
3680
3796
"index" ,
3681
3797
[0 , 5 , - 2 ],
@@ -3687,9 +3803,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index):
3687
3803
assert bf_result == pd_result
3688
3804
3689
3805
3690
- def test_iloc_single_integer_out_of_bound_error (
3691
- scalars_df_index , scalars_pandas_df_index
3692
- ):
3806
def test_iloc_single_integer_out_of_bound_error(scalars_df_index):
    """Expect ``IndexError`` when ``string_col.iloc`` is indexed at position 99.

    Args:
        scalars_df_index: fixture object exposing a ``string_col`` attribute.
    """
    expected_message = "single positional indexer is out-of-bounds"
    with pytest.raises(IndexError, match=expected_message):
        scalars_df_index.string_col.iloc[99]
3695
3809
0 commit comments