Skip to content

fix: reloaded transformer .transform error #569

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions bigframes/ml/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,17 @@ def camel_to_snake(name):
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()

output_names = []
for transform_col in bq_model._properties["transformColumns"]:
transform_col_dict = cast(dict, transform_col)
# skip the columns that are not transformed (they carry no "transformSql" entry)
if "transformSql" not in transform_col:
if "transformSql" not in transform_col_dict:
continue
transform_sql: str = cast(dict, transform_col)["transformSql"]
transform_sql: str = transform_col_dict["transformSql"]
if not transform_sql.startswith("ML."):
continue

output_names.append(transform_col_dict["name"])
found_transformer = False
for prefix in _BQML_TRANSFROM_TYPE_MAPPING:
if transform_sql.startswith(prefix):
Expand All @@ -141,7 +144,10 @@ def camel_to_snake(name):
f"Unsupported transformer type. {constants.FEEDBACK_LINK}"
)

return cls(transformers=transformers)
transformer = cls(transformers=transformers)
transformer._output_names = output_names

return transformer

def _merge(
self, bq_model: bigquery.Model
Expand All @@ -164,6 +170,7 @@ def _merge(
for feature_column in bq_model.feature_columns
]
) == sorted(columns):
transformer_0._output_names = self._output_names
return transformer_0

return self
Expand Down
23 changes: 23 additions & 0 deletions tests/system/large/ml/test_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,26 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id):
]
assert reloaded_transformer.transformers_ == expected
assert reloaded_transformer._bqml_model is not None

result = transformer.fit_transform(
new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]]
).to_pandas()

expected = pandas.DataFrame(
{
"onehotencoded_species": [
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 2, "value": 1.0}],
],
"standard_scaled_culmen_length_mm": [
1.313249,
-0.20198,
-1.111118,
],
"standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338],
},
index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"),
)

pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False)
129 changes: 116 additions & 13 deletions tests/system/small/ml/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
Expand Down Expand Up @@ -92,7 +92,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -140,6 +140,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.StandardScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118],
"standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848],
"standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod.
Expand Down Expand Up @@ -177,7 +193,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
Expand All @@ -201,7 +217,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -229,7 +245,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -244,6 +260,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494],
"max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766],
"max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
scaler = preprocessing.MinMaxScaler()
Expand All @@ -266,7 +298,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -295,7 +327,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -335,7 +367,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -350,6 +382,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0],
"min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625],
"min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df):
discretizer = preprocessing.KBinsDiscretizer(strategy="uniform")
Expand All @@ -372,7 +420,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_series_normalizes(
Expand All @@ -399,7 +447,7 @@ def test_k_bins_discretizer_series_normalizes(
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -434,7 +482,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalizes_different_params(
Expand Down Expand Up @@ -471,7 +519,7 @@ def test_k_bins_discretizer_normalizes_different_params(
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
Expand All @@ -488,6 +536,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.strategy == transformer.strategy
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"],
"kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"],
"kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"],
},
dtype="string[pyarrow]",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_one_hot_encoder_default_params(new_penguins_df):
encoder = preprocessing.OneHotEncoder()
Expand Down Expand Up @@ -650,6 +714,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.max_categories == transformer.max_categories
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["species", "sex"]]
).to_pandas()

expected = pd.DataFrame(
{
"onehotencoded_species": [
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 2, "value": 1.0}],
],
"onehotencoded_sex": [
[{"index": 2, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
],
},
dtype=ONE_HOT_ENCODED_DTYPE,
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected)


def test_label_encoder_default_params(new_penguins_df):
encoder = preprocessing.LabelEncoder()
Expand Down Expand Up @@ -792,5 +879,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.max_categories == transformer.max_categories
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(new_penguins_df).to_pandas()

expected = pd.DataFrame(
{
"labelencoded_species": [
1,
1,
2,
],
},
dtype="Int64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected)


# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.