13
13
# limitations under the License.
14
14
15
15
import datetime as dt
16
+ import json
16
17
import math
17
18
import re
18
19
import tempfile
19
20
20
21
import db_dtypes # type: ignore
21
22
import geopandas as gpd # type: ignore
23
+ import google .api_core .exceptions
22
24
import numpy
23
25
from packaging .version import Version
24
26
import pandas as pd
@@ -3474,9 +3476,11 @@ def foo(x):
3474
3476
("int64_col" , pd .ArrowDtype (pa .timestamp ("us" ))),
3475
3477
("int64_col" , pd .ArrowDtype (pa .timestamp ("us" , tz = "UTC" ))),
3476
3478
("int64_col" , "time64[us][pyarrow]" ),
3479
+ ("int64_col" , pd .ArrowDtype (db_dtypes .JSONArrowType ())),
3477
3480
("bool_col" , "Int64" ),
3478
3481
("bool_col" , "string[pyarrow]" ),
3479
3482
("bool_col" , "Float64" ),
3483
+ ("bool_col" , pd .ArrowDtype (db_dtypes .JSONArrowType ())),
3480
3484
("string_col" , "binary[pyarrow]" ),
3481
3485
("bytes_col" , "string[pyarrow]" ),
3482
3486
# pandas actually doesn't let folks convert to/from naive timestamp and
@@ -3541,7 +3545,7 @@ def test_astype_safe(session):
3541
3545
pd .testing .assert_series_equal (result , exepcted )
3542
3546
3543
3547
3544
- def test_series_astype_error_error (session ):
3548
def test_series_astype_w_invalid_error(session):
    """Check that ``astype`` rejects an unrecognised ``errors`` argument.

    The call passes ``errors="bad_value"`` and the test expects a
    ``ValueError`` to be raised.
    """
    # Named ``pd_series`` rather than ``input`` so the builtin is not shadowed.
    pd_series = pd.Series(["hello", "world", "3.11", "4000"])
    with pytest.raises(ValueError):
        session.read_pandas(pd_series).astype("Float64", errors="bad_value")
@@ -3676,6 +3680,119 @@ def test_timestamp_astype_string():
3676
3680
assert bf_result .dtype == "string[pyarrow]"
3677
3681
3678
3682
3683
@pytest.mark.parametrize("errors", ["raise", "null"])
def test_float_astype_json(errors):
    """Cast a float series to the JSON dtype under both ``errors`` modes.

    The result must carry the JSON dtype and equal a pandas series built
    directly from the same literals with the JSON dtype.
    """
    raw_values = ["1.25", "2500000000", None, "-12323.24"]
    float_series = series.Series(raw_values, dtype=dtypes.FLOAT_DTYPE)

    converted = float_series.astype(dtypes.JSON_DTYPE, errors=errors)
    assert converted.dtype == dtypes.JSON_DTYPE

    # The reference index is cast to Int64 before comparing; presumably this
    # matches the default index dtype of the result (confirm against Series).
    reference = pd.Series(raw_values, dtype=dtypes.JSON_DTYPE)
    reference.index = reference.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), reference)
3694
+
3695
+
3696
@pytest.mark.parametrize("errors", ["raise", "null"])
def test_string_astype_json(errors):
    """Cast valid JSON text to the JSON dtype and compare with pandas.

    Covers a scalar, a null, an array and a nested object, under both
    ``errors`` modes.
    """
    json_texts = [
        "1",
        None,
        '["1","3","5"]',
        '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}',
    ]
    string_series = series.Series(json_texts, dtype=dtypes.STRING_DTYPE)

    converted = string_series.astype(dtypes.JSON_DTYPE, errors=errors)
    assert converted.dtype == dtypes.JSON_DTYPE

    # Reference: download the string series first, then cast it locally.
    reference = string_series.to_pandas().astype(dtypes.JSON_DTYPE)
    pd.testing.assert_series_equal(converted.to_pandas(), reference)
3711
+
3712
+
3713
def test_string_astype_json_in_safe_mode():
    """With ``errors="null"``, text that is not valid JSON is expected to become null."""
    not_json = ["this is not a valid json string"]
    converted = series.Series(not_json, dtype=dtypes.STRING_DTYPE).astype(
        dtypes.JSON_DTYPE, errors="null"
    )
    assert converted.dtype == dtypes.JSON_DTYPE

    # A single null JSON value, with the index cast to Int64 for comparison.
    all_null = pd.Series([None], dtype=dtypes.JSON_DTYPE)
    all_null.index = all_null.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), all_null)
3722
+
3723
+
3724
def test_string_astype_json_raise_error():
    """With ``errors="raise"``, invalid JSON text is expected to fail as ``BadRequest``."""
    not_json = ["this is not a valid json string"]
    string_series = series.Series(not_json, dtype=dtypes.STRING_DTYPE)

    expected_error = google.api_core.exceptions.BadRequest
    expected_message = "syntax error while parsing value"
    # Both the cast and the download stay inside the context manager, so the
    # error may surface from either call.
    with pytest.raises(expected_error, match=expected_message):
        string_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas()
3732
+
3733
+
3734
@pytest.mark.parametrize("errors", ["raise", "null"])
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"),
        pytest.param(
            ['"str"', None],
            dtypes.TIME_DTYPE,
            id="invalid",
            marks=pytest.mark.xfail(raises=TypeError),
        ),
    ],
)
def test_json_astype_others(data, to_type, errors):
    """Cast JSON values to int, float, bool and string dtypes.

    The expected series is built by decoding each JSON literal with
    ``json.loads`` (nulls stay ``None``) and loading the results with the
    target dtype. The ``invalid`` case targets the time dtype and is marked
    as an expected failure raising ``TypeError``.
    """
    json_series = series.Series(data, dtype=dtypes.JSON_DTYPE)

    converted = json_series.astype(to_type, errors=errors)
    assert converted.dtype == to_type

    decoded = [None if text is None else json.loads(text) for text in data]
    reference = pd.Series(decoded, dtype=to_type)
    reference.index = reference.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), reference)
3760
+
3761
+
3762
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
    ],
)
def test_json_astype_others_raise_error(data, to_type):
    """With ``errors="raise"``, each mismatched JSON value is expected to fail as ``BadRequest``."""
    json_series = series.Series(data, dtype=dtypes.JSON_DTYPE)
    expected_error = google.api_core.exceptions.BadRequest
    # Cast and download are both inside the context manager, as either call
    # may be the one that raises.
    with pytest.raises(expected_error):
        json_series.astype(to_type, errors="raise").to_pandas()
3775
+
3776
+
3777
@pytest.mark.parametrize(
    ("data", "to_type"),
    [
        pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
        pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
        pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
        pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
    ],
)
def test_json_astype_others_in_safe_mode(data, to_type):
    """With ``errors="null"``, each mismatched JSON value is expected to become null."""
    converted = series.Series(data, dtype=dtypes.JSON_DTYPE).astype(
        to_type, errors="null"
    )
    assert converted.dtype == to_type

    # Every case has two rows, and both are expected to come back null.
    all_null = pd.Series([None, None], dtype=to_type)
    all_null.index = all_null.index.astype("Int64")
    pd.testing.assert_series_equal(converted.to_pandas(), all_null)
3794
+
3795
+
3679
3796
@pytest .mark .parametrize (
3680
3797
"index" ,
3681
3798
[0 , 5 , - 2 ],
@@ -3687,9 +3804,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index):
3687
3804
assert bf_result == pd_result
3688
3805
3689
3806
3690
- def test_iloc_single_integer_out_of_bound_error (
3691
- scalars_df_index , scalars_pandas_df_index
3692
- ):
3807
def test_iloc_single_integer_out_of_bound_error(scalars_df_index):
    """Expect ``IndexError`` when ``iloc[99]`` is read from the string column."""
    expected_message = "single positional indexer is out-of-bounds"
    with pytest.raises(IndexError, match=expected_message):
        scalars_df_index.string_col.iloc[99]
3695
3810
0 commit comments