From 8703eb4fe184b1987a3513fe87dd3316cc052117 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 22:17:59 +0000 Subject: [PATCH 01/15] docs: add matrix_factorization snippets --- samples/snippets/mf_explicit_model_test.py | 17 +++++++++++++++++ samples/snippets/mf_implicit_model_test.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 samples/snippets/mf_explicit_model_test.py create mode 100644 samples/snippets/mf_implicit_model_test.py diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py new file mode 100644 index 0000000000..cac9dfb3a2 --- /dev/null +++ b/samples/snippets/mf_explicit_model_test.py @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. + + +def test_explicit_matrix_factorization(random_model_id: str) -> None: + pass diff --git a/samples/snippets/mf_implicit_model_test.py b/samples/snippets/mf_implicit_model_test.py new file mode 100644 index 0000000000..cac9dfb3a2 --- /dev/null +++ b/samples/snippets/mf_implicit_model_test.py @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. + + +def test_explicit_matrix_factorization(random_model_id: str) -> None: + pass From 5b71583f9d89fb586a97f83b35875c3ce3bf5a2b Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 2 Apr 2025 18:15:24 +0000 Subject: [PATCH 02/15] incomplete mf snippets --- samples/snippets/mf_explicit_model_test.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index cac9dfb3a2..aa4f3b73b3 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -14,4 +14,39 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: + your_model_id = random_model_id + # [START bigframes_dataframes_bqml_mf_create] + from bigframes.ml import decomposition + import bigframes.pandas as bpd + + # Load data from BigQuery + bq_df = bpd.read_gbq( + "bqml_tutorial.ratings", columns=("user_id", "item_id", "rating") + ) + + # Create the Matrix Factorization model + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_id", + rating_col="rating", + l2_reg=9.83, + ) + model.fit(bq_df) + model.to_gbq( + your_model_id, replace=True # For example: "bqml_tutorial.mf_explicit" + ) + # [END bigframes_dataframes_bqml_mf_create] + # [START bigframes_dataframe_bqml_mf_evaluate] + import bigframes.pandas as bpd + + model.score(bq_df) + # [END bigframes_dataframe_bqml_mf_evaluate] + # [START bigframes_dataframe_bqml_mf_predict] + + # [END bigframes_dataframe_bqml_mf_predict] + # [START bigframes_dataframe_bqml_mf_recommend] + model.predict(bq_df) + # [END bigframes_dataframe_bqml_mf_recommend] pass From edd4cd568f6de26686aa9f48dd53c4271f5863c4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 2 Apr 2025 19:41:06 +0000 Subject: [PATCH 03/15] prep implicit --- samples/snippets/mf_explicit_model_test.py | 16 +++++----- samples/snippets/mf_implicit_model_test.py | 34 +++++++++++++++++++++- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index aa4f3b73b3..3ecaefe9a2 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -15,7 +15,7 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: your_model_id = random_model_id - # [START bigframes_dataframes_bqml_mf_create] + # [START bigframes_dataframes_bqml_mf_explicit_create] from bigframes.ml import decomposition import bigframes.pandas as bpd @@ -37,16 +37,16 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: model.to_gbq( your_model_id, replace=True # For example: "bqml_tutorial.mf_explicit" ) - # [END bigframes_dataframes_bqml_mf_create] - # [START bigframes_dataframe_bqml_mf_evaluate] + # [END bigframes_dataframes_bqml_mf_explicit_create] + # [START bigframes_dataframe_bqml_mf_explicit_evaluate] import bigframes.pandas as bpd model.score(bq_df) - # [END bigframes_dataframe_bqml_mf_evaluate] - # [START bigframes_dataframe_bqml_mf_predict] + # [END bigframes_dataframe_bqml_mf_explicit_evaluate] + # [START bigframes_dataframe_bqml_mf_explicit_predict] - # [END bigframes_dataframe_bqml_mf_predict] - # [START bigframes_dataframe_bqml_mf_recommend] + # [END bigframes_dataframe_bqml_mf_explicit_predict] + # [START bigframes_dataframe_bqml_mf_explicit_recommend] model.predict(bq_df) - # [END bigframes_dataframe_bqml_mf_recommend] + # [END bigframes_dataframe_bqml_mf_explicit_recommend] pass diff --git a/samples/snippets/mf_implicit_model_test.py b/samples/snippets/mf_implicit_model_test.py index cac9dfb3a2..54dc66c645 100644 --- a/samples/snippets/mf_implicit_model_test.py +++ b/samples/snippets/mf_implicit_model_test.py @@ -13,5 +13,37 @@ # limitations under the License. -def test_explicit_matrix_factorization(random_model_id: str) -> None: +def test_implicit_matrix_factorization(random_model_id: str) -> None: + # [START bigframes_dataframe_mf_implicit_data] + from bigframes.ml import decomposition + import bigframes.pandas as bpd + + # sample data must be created from joined data and then grouped and ordered + bq_df = bpd.read_gbq("bqml_tutorial.analytics_session_data") + print(bq_df.peek(5)) + # Expected output: + # + # [END bigframes_dataframe_mf_implicit_data] + # [START bigframes_dataframe_mf_implicit_model] + rating = 0.3 * (1 + (bq_df["session_duration"] - 57937) / 57937) + model = decomposition.MatrixFactorization( + num_factors=15, + feedback_type="implicit", + user_col="visitorId", + item_col="contentId", + rating_col=rating, + l2_reg=30, + ) + # condition of rating < 1 required before fitting model + + # [END bigframes_dataframe_mf_implicit_model] + # [START bigframes_dataframe_mf_implicit_evaluate] + model.fit(bq_df) + # [END bigframes_dataframe_mf_implicit_evaluate] + # [START bigframes_dataframe_mf_implicit_subset] + + # [END bigframes_dataframe_mf_implicit_subset] + # [START bigframes_dataframe_mf_implicit_recommend] + model.score() + # [END bigframes_dataframe_mf_implicit_recommend] pass From b898a76f5c2e0fe3e40ffc253d1f89b08d2017a2 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 1 May 2025 18:44:01 +0000 Subject: [PATCH 04/15] near complete tutorial --- samples/snippets/mf_explicit_model_test.py | 44 ++++++++++++++++++---- samples/snippets/mf_implicit_model_test.py | 2 - 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index 3ecaefe9a2..71c20db54f 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -39,14 +39,42 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: ) # [END bigframes_dataframes_bqml_mf_explicit_create] # [START bigframes_dataframe_bqml_mf_explicit_evaluate] - import bigframes.pandas as bpd - - model.score(bq_df) + # Evaluate the model using the score() function + model.score() + # Output: + # mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance + # 0.485403 0.395052 0.025515 0.390573 0.68343 0.68343 # [END bigframes_dataframe_bqml_mf_explicit_evaluate] - # [START bigframes_dataframe_bqml_mf_explicit_predict] + # [START bigframes_dataframe_bqml_mf_recommend_df] + subset = bq_df.head(6) + predicted = model.predict(subset) + print(predicted) + # Output: + # predicted_rating user_id item_id rating + # 0 4.206146 4354 968 4.0 + # 1 4.853099 3622 3521 5.0 + # 2 2.679067 5543 920 2.0 + # 3 4.323458 445 3175 5.0 + # 4 3.476911 5535 235 4.0 + # [END bigframes_dataframe_bqml_mf_explicit_recommend_df] + # [START bigframes_dataframe_bqml_mf_explicit_recommend_model] + # import bigframes.bigquery as bbq - # [END bigframes_dataframe_bqml_mf_explicit_predict] - # [START bigframes_dataframe_bqml_mf_explicit_recommend] - model.predict(bq_df) - # [END bigframes_dataframe_bqml_mf_explicit_recommend] + # TODO: implement right_index parameter for DataFrame.merge() + # # Load movie data from BigQuery + # movies = bpd.read_gbq("bqml_tutorial.movies") + # # Merge movie data with rating data + # merged_df = bpd.merge(predicted, movies, left_on='item_id', right_on='movie_id') + # # separate users from data to call struct on data + # users = merged_df[['user_id', 'item_id']] + # user_data = merged_df[['movie_title', 'genre', 'predicted_rating', 'movie_id']].set_index('movie_id') + # struct_data = bbq.struct(user_data).to_frame() + # # Merge data to groupby predicted_rating and sort + # merged_user = bpd.merge(users, struct_data, left_on='item_id', right_index=True).drop('item_id', axis=1) + # desc_pred = merged_user.sort_values(by='predicted_rating', ascending=False) + # grouped = desc_pred.groupby('predicted_rating') + # result = bbq.array_agg(grouped) + # result.head(5) + # Output: + # [END bigframes_dataframe_bqml_mf_explicit_recommend_model] pass diff --git a/samples/snippets/mf_implicit_model_test.py b/samples/snippets/mf_implicit_model_test.py index 54dc66c645..dc658ad417 100644 --- a/samples/snippets/mf_implicit_model_test.py +++ b/samples/snippets/mf_implicit_model_test.py @@ -41,9 +41,7 @@ def test_implicit_matrix_factorization(random_model_id: str) -> None: model.fit(bq_df) # [END bigframes_dataframe_mf_implicit_evaluate] # [START bigframes_dataframe_mf_implicit_subset] - # [END bigframes_dataframe_mf_implicit_subset] # [START bigframes_dataframe_mf_implicit_recommend] - model.score() # [END bigframes_dataframe_mf_implicit_recommend] pass From 33f446a1cf70ed6440538b20776bf5f972bb5437 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 1 May 2025 19:06:36 +0000 Subject: [PATCH 05/15] implicit create --- samples/snippets/mf_implicit_model_test.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/samples/snippets/mf_implicit_model_test.py b/samples/snippets/mf_implicit_model_test.py index dc658ad417..7203d5e591 100644 --- a/samples/snippets/mf_implicit_model_test.py +++ b/samples/snippets/mf_implicit_model_test.py @@ -25,20 +25,23 @@ def test_implicit_matrix_factorization(random_model_id: str) -> None: # # [END bigframes_dataframe_mf_implicit_data] # [START bigframes_dataframe_mf_implicit_model] - rating = 0.3 * (1 + (bq_df["session_duration"] - 57937) / 57937) + rating_calculation = 0.3 * (1 + (bq_df["session_duration"] - 57937) / 57937) + filtered_bq_df = bq_df[rating_calculation < 1].assign( + rating=rating_calculation[rating_calculation < 1] + ) model = decomposition.MatrixFactorization( num_factors=15, feedback_type="implicit", user_col="visitorId", item_col="contentId", - rating_col=rating, + rating_col="rating", l2_reg=30, ) - # condition of rating < 1 required before fitting model - + model.fit(filtered_bq_df) # [END bigframes_dataframe_mf_implicit_model] # [START bigframes_dataframe_mf_implicit_evaluate] - model.fit(bq_df) + model.score() + # Output: # [END bigframes_dataframe_mf_implicit_evaluate] # [START bigframes_dataframe_mf_implicit_subset] # [END bigframes_dataframe_mf_implicit_subset] From 431e9eb30187e938cc2fe0d0e81705f116907c31 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 1 May 2025 19:11:10 +0000 Subject: [PATCH 06/15] add doc note --- samples/snippets/mf_explicit_model_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index 71c20db54f..dec8f0ad01 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -46,7 +46,8 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # 0.485403 0.395052 0.025515 0.390573 0.68343 0.68343 # [END bigframes_dataframe_bqml_mf_explicit_evaluate] # [START bigframes_dataframe_bqml_mf_recommend_df] - subset = bq_df.head(6) + # Use predict() to get the predicted rating for each movie for 5 users + subset = bq_df.head(5) predicted = model.predict(subset) print(predicted) # Output: From 7380f6fac0b509644cf45d6a6a03dcd98e158cc6 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 5 May 2025 14:44:19 +0000 Subject: [PATCH 07/15] complete explicit tutorial --- samples/snippets/mf_explicit_model_test.py | 49 +++++++++++++++------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index dec8f0ad01..1493a7e9a8 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -61,21 +61,40 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # [START bigframes_dataframe_bqml_mf_explicit_recommend_model] # import bigframes.bigquery as bbq - # TODO: implement right_index parameter for DataFrame.merge() - # # Load movie data from BigQuery - # movies = bpd.read_gbq("bqml_tutorial.movies") - # # Merge movie data with rating data - # merged_df = bpd.merge(predicted, movies, left_on='item_id', right_on='movie_id') - # # separate users from data to call struct on data - # users = merged_df[['user_id', 'item_id']] - # user_data = merged_df[['movie_title', 'genre', 'predicted_rating', 'movie_id']].set_index('movie_id') - # struct_data = bbq.struct(user_data).to_frame() - # # Merge data to groupby predicted_rating and sort - # merged_user = bpd.merge(users, struct_data, left_on='item_id', right_index=True).drop('item_id', axis=1) - # desc_pred = merged_user.sort_values(by='predicted_rating', ascending=False) - # grouped = desc_pred.groupby('predicted_rating') - # result = bbq.array_agg(grouped) - # result.head(5) + # Load movies + movies = bpd.read_gbq("bqml_tutorial.movies") + + # Merge the movies df with the previously created predicted df + merged_df = bpd.merge(predicted, movies, left_on="item_id", right_on="movie_id") + + # Separate users and predicted data, setting the index to 'movie_id' + users = merged_df[["user_id", "movie_id"]].set_index("movie_id") + + # Take the predicted data and sort it in descending order by 'predicted_rating', setting the index to 'movie_id' + sort_data = ( + merged_df[["movie_title", "genre", "predicted_rating", "movie_id"]] + .sort_values(by="predicted_rating", ascending=False) + .set_index("movie_id") + ) + + # re-merge the separated dfs by index + merged_user = sort_data.join(users, how="outer") + + # group the users and set the user_id as the index + merged_user.groupby("user_id").head(5).set_index("user_id").sort_index() + print(merged_user) # Output: + # movie_title genre predicted_rating + # user_id + # 1 Saving Private Ryan (1998) Action|Drama|War 5.19326 + # 1 Fargo (1996) Crime|Drama|Thriller 4.996954 + # 1 Driving Miss Daisy (1989) Drama 4.983671 + # 1 Ben-Hur (1959) Action|Adventure|Drama 4.877622 + # 1 Schindler's List (1993) Drama|War 4.802336 + # 2 Saving Private Ryan (1998) Action|Drama|War 5.19326 + # 2 Braveheart (1995) Action|Drama|War 5.174145 + # 2 Gladiator (2000) Action|Drama 5.066372 + # 2 On Golden Pond (1981) Drama 5.01198 + # 2 Driving Miss Daisy (1989) Drama 4.983671 # [END bigframes_dataframe_bqml_mf_explicit_recommend_model] pass From 74c0d853b8616b154bfe014433eb56fc6d9ebf1b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 5 May 2025 15:00:31 +0000 Subject: [PATCH 08/15] remove implicit snippets --- samples/snippets/mf_implicit_model_test.py | 50 ---------------------- 1 file changed, 50 deletions(-) delete mode 100644 samples/snippets/mf_implicit_model_test.py diff --git a/samples/snippets/mf_implicit_model_test.py b/samples/snippets/mf_implicit_model_test.py deleted file mode 100644 index 7203d5e591..0000000000 --- a/samples/snippets/mf_implicit_model_test.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (t -# you may not use this file except in compliance wi -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in -# distributed under the License is distributed on a -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit -# See the License for the specific language governi -# limitations under the License. - - -def test_implicit_matrix_factorization(random_model_id: str) -> None: - # [START bigframes_dataframe_mf_implicit_data] - from bigframes.ml import decomposition - import bigframes.pandas as bpd - - # sample data must be created from joined data and then grouped and ordered - bq_df = bpd.read_gbq("bqml_tutorial.analytics_session_data") - print(bq_df.peek(5)) - # Expected output: - # - # [END bigframes_dataframe_mf_implicit_data] - # [START bigframes_dataframe_mf_implicit_model] - rating_calculation = 0.3 * (1 + (bq_df["session_duration"] - 57937) / 57937) - filtered_bq_df = bq_df[rating_calculation < 1].assign( - rating=rating_calculation[rating_calculation < 1] - ) - model = decomposition.MatrixFactorization( - num_factors=15, - feedback_type="implicit", - user_col="visitorId", - item_col="contentId", - rating_col="rating", - l2_reg=30, - ) - model.fit(filtered_bq_df) - # [END bigframes_dataframe_mf_implicit_model] - # [START bigframes_dataframe_mf_implicit_evaluate] - model.score() - # Output: - # [END bigframes_dataframe_mf_implicit_evaluate] - # [START bigframes_dataframe_mf_implicit_subset] - # [END bigframes_dataframe_mf_implicit_subset] - # [START bigframes_dataframe_mf_implicit_recommend] - # [END bigframes_dataframe_mf_implicit_recommend] - pass From 862e11815e5995ebc8cf8f1d5e8337ae2f1f656d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 7 May 2025 09:40:06 -0500 Subject: [PATCH 09/15] Update samples/snippets/mf_explicit_model_test.py --- samples/snippets/mf_explicit_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index 1493a7e9a8..d67c7b9199 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -45,7 +45,7 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance # 0.485403 0.395052 0.025515 0.390573 0.68343 0.68343 # [END bigframes_dataframe_bqml_mf_explicit_evaluate] - # [START bigframes_dataframe_bqml_mf_recommend_df] + # [START bigframes_dataframe_bqml_mf_explicit_recommend_df] # Use predict() to get the predicted rating for each movie for 5 users subset = bq_df.head(5) predicted = model.predict(subset) From 7f2d7f672c8ec085dcd384ee4eb03ce88e5ba98a Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 7 May 2025 10:24:07 -0500 Subject: [PATCH 10/15] add snippets to create dataset and movielens tables --- samples/snippets/mf_explicit_model_test.py | 63 ++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index d67c7b9199..b4fb2cad82 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -15,6 +15,69 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: your_model_id = random_model_id + + # [START bigframes_dataframes_bqml_mf_explicit_create_dataset] + import google.cloud.bigquery + + bqclient = google.cloud.bigquery.Client() + bqclient.create_dataset("bqml_tutorial", exists_ok=True) + # [END bigframes_dataframes_bqml_mf_explicit_create_dataset] + + # [START bigframes_dataframes_bqml_mf_explicit_upload_movielens] + import io + import zipfile + + import google.api_core.exceptions + import requests + + try: + # Check if you've already created the Movielens tables to avoid downloading + # and uploading the dataset unnecessarily. + bqclient.get_table("bqml_tutorial.ratings") + bqclient.get_table("bqml_tutorial.movies") + except google.api_core.exceptions.NotFound: + # Download the https://grouplens.org/datasets/movielens/1m/ dataset. + ml1m = requests.get("http://files.grouplens.org/datasets/movielens/ml-1m.zip") + ml1m_file = io.BytesIO(ml1m.content) + ml1m_zip = zipfile.ZipFile(ml1m_file) + + # Upload the ratings data into the ratings table. + with ml1m_zip.open("ml-1m/ratings.dat") as ratings_file: + ratings_content = ratings_file.read() + + ratings_csv = io.BytesIO(ratings_content.replace(b"::", b",")) + ratings_config = google.cloud.bigquery.LoadJobConfig() + ratings_config.source_format = "CSV" + ratings_config.write_disposition = "WRITE_TRUNCATE" + ratings_config.schema = [ + google.cloud.bigquery.SchemaField("user_id", "INT64"), + google.cloud.bigquery.SchemaField("item_id", "INT64"), + google.cloud.bigquery.SchemaField("rating", "FLOAT64"), + google.cloud.bigquery.SchemaField("timestamp", "TIMESTAMP"), + ] + bqclient.load_table_from_file( + ratings_csv, "bqml_tutorial.ratings", job_config=ratings_config + ).result() + + # Upload the movie data into the movies table. + with ml1m_zip.open("ml-1m/movies.dat") as movies_file: + movies_content = movies_file.read() + + movies_csv = io.BytesIO(movies_content.replace(b"::", b"@")) + movies_config = google.cloud.bigquery.LoadJobConfig() + movies_config.source_format = "CSV" + movies_config.field_delimiter = "@" + movies_config.write_disposition = "WRITE_TRUNCATE" + movies_config.schema = [ + google.cloud.bigquery.SchemaField("movie_id", "INT64"), + google.cloud.bigquery.SchemaField("movie_title", "STRING"), + google.cloud.bigquery.SchemaField("genre", "STRING"), + ] + bqclient.load_table_from_file( + movies_csv, "bqml_tutorial.movies", job_config=movies_config + ).result() + # [END bigframes_dataframes_bqml_mf_explicit_upload_movielens] + # [START bigframes_dataframes_bqml_mf_explicit_create] from bigframes.ml import decomposition import bigframes.pandas as bpd From 6bdfbfb29bdef9e2b44d75071865bbec6062871e Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 7 May 2025 10:27:37 -0500 Subject: [PATCH 11/15] correct the region tags --- samples/snippets/mf_explicit_model_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index b4fb2cad82..9189147a67 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -16,14 +16,14 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: your_model_id = random_model_id - # [START bigframes_dataframes_bqml_mf_explicit_create_dataset] + # [START bigquery_dataframes_bqml_mf_explicit_create_dataset] import google.cloud.bigquery bqclient = google.cloud.bigquery.Client() bqclient.create_dataset("bqml_tutorial", exists_ok=True) - # [END bigframes_dataframes_bqml_mf_explicit_create_dataset] + # [END bigquery_dataframes_bqml_mf_explicit_create_dataset] - # [START bigframes_dataframes_bqml_mf_explicit_upload_movielens] + # [START bigquery_dataframes_bqml_mf_explicit_upload_movielens] import io import zipfile @@ -76,9 +76,9 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: bqclient.load_table_from_file( movies_csv, "bqml_tutorial.movies", job_config=movies_config ).result() - # [END bigframes_dataframes_bqml_mf_explicit_upload_movielens] + # [END bigquery_dataframes_bqml_mf_explicit_upload_movielens] - # [START bigframes_dataframes_bqml_mf_explicit_create] + # [START bigquery_dataframes_bqml_mf_explicit_create] from bigframes.ml import decomposition import bigframes.pandas as bpd @@ -100,7 +100,7 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: model.to_gbq( your_model_id, replace=True # For example: "bqml_tutorial.mf_explicit" ) - # [END bigframes_dataframes_bqml_mf_explicit_create] + # [END bigquery_dataframes_bqml_mf_explicit_create] # [START bigframes_dataframe_bqml_mf_explicit_evaluate] # Evaluate the model using the score() function model.score() From 1847d61dbca675f159f68cc15642c2dc267fc889 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 7 May 2025 10:28:37 -0500 Subject: [PATCH 12/15] correct more region tags --- samples/snippets/mf_explicit_model_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index 9189147a67..a13b77263d 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -101,14 +101,14 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: your_model_id, replace=True # For example: "bqml_tutorial.mf_explicit" ) # [END bigquery_dataframes_bqml_mf_explicit_create] - # [START bigframes_dataframe_bqml_mf_explicit_evaluate] + # [START bigquery_dataframes_bqml_mf_explicit_evaluate] # Evaluate the model using the score() function model.score() # Output: # mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance # 0.485403 0.395052 0.025515 0.390573 0.68343 0.68343 - # [END bigframes_dataframe_bqml_mf_explicit_evaluate] - # [START bigframes_dataframe_bqml_mf_explicit_recommend_df] + # [END bigquery_dataframes_bqml_mf_explicit_evaluate] + # [START bigquery_dataframes_bqml_mf_explicit_recommend_df] # Use predict() to get the predicted rating for each movie for 5 users subset = bq_df.head(5) predicted = model.predict(subset) @@ -120,8 +120,8 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # 2 2.679067 5543 920 2.0 # 3 4.323458 445 3175 5.0 # 4 3.476911 5535 235 4.0 - # [END bigframes_dataframe_bqml_mf_explicit_recommend_df] - # [START bigframes_dataframe_bqml_mf_explicit_recommend_model] + # [END bigquery_dataframes_bqml_mf_explicit_recommend_df] + # [START bigquery_dataframes_bqml_mf_explicit_recommend_model] # import bigframes.bigquery as bbq # Load movies @@ -159,5 +159,5 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # 2 Gladiator (2000) Action|Drama 5.066372 # 2 On Golden Pond (1981) Drama 5.01198 # 2 Driving Miss Daisy (1989) Drama 4.983671 - # [END bigframes_dataframe_bqml_mf_explicit_recommend_model] + # [END bigquery_dataframes_bqml_mf_explicit_recommend_model] pass From 0ad94b2e09d20070ce89111a3edc8c0cfafac0e4 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 7 May 2025 14:14:54 -0500 Subject: [PATCH 13/15] Update samples/snippets/mf_explicit_model_test.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- samples/snippets/mf_explicit_model_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index a13b77263d..ba5e1fa026 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -160,4 +160,3 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # 2 On Golden Pond (1981) Drama 5.01198 # 2 Driving Miss Daisy (1989) Drama 4.983671 # [END bigquery_dataframes_bqml_mf_explicit_recommend_model] - pass From 1a81d6a452ff2dda9c503a0fe668e27669abdcb6 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 7 May 2025 14:15:25 -0500 Subject: [PATCH 14/15] Update samples/snippets/mf_explicit_model_test.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- samples/snippets/mf_explicit_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index ba5e1fa026..08506991c5 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -110,7 +110,7 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # [END bigquery_dataframes_bqml_mf_explicit_evaluate] # [START bigquery_dataframes_bqml_mf_explicit_recommend_df] # Use predict() to get the predicted rating for each movie for 5 users - subset = bq_df.head(5) + subset = bq_df[["user_id"]].head(5) predicted = model.predict(subset) print(predicted) # Output: From dbfadbd2536e6522e210ce2dbd1541eab6b4de3c Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 7 May 2025 15:50:37 -0500 Subject: [PATCH 15/15] update evaluate section --- samples/snippets/mf_explicit_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/mf_explicit_model_test.py b/samples/snippets/mf_explicit_model_test.py index 08506991c5..fb54b7271c 100644 --- a/samples/snippets/mf_explicit_model_test.py +++ b/samples/snippets/mf_explicit_model_test.py @@ -103,7 +103,7 @@ def test_explicit_matrix_factorization(random_model_id: str) -> None: # [END bigquery_dataframes_bqml_mf_explicit_create] # [START bigquery_dataframes_bqml_mf_explicit_evaluate] # Evaluate the model using the score() function - model.score() + model.score(bq_df) # Output: # mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance # 0.485403 0.395052 0.025515 0.390573 0.68343 0.68343