|
| 1 | +# Copyright 2024 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (t |
| 4 | +# you may not use this file except in compliance wi |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in |
| 10 | +# distributed under the License is distributed on a |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit |
| 12 | +# See the License for the specific language governi |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | + |
def test_explicit_matrix_factorization(random_model_id: str) -> None:
    """Run the explicit-feedback matrix factorization tutorial end to end.

    The body doubles as documentation: each ``[START ...]`` / ``[END ...]``
    pair delimits a snippet. The steps are:

    1. Create the ``bqml_tutorial`` dataset if it does not exist.
    2. Load the MovieLens 1M ``ratings`` and ``movies`` tables, downloading
       the archive only when either table is missing.
    3. Fit a ``MatrixFactorization`` model on the ratings and save it.
    4. Score the model and predict ratings for a handful of users.
    5. Join predictions with movie metadata and keep the first five rows
       per user.

    Args:
        random_model_id: Model ID under which the fitted model is saved with
            ``to_gbq``; an existing model with that ID is replaced.
    """
    your_model_id = random_model_id

    # [START bigquery_dataframes_bqml_mf_explicit_create_dataset]
    import google.cloud.bigquery

    bqclient = google.cloud.bigquery.Client()
    bqclient.create_dataset("bqml_tutorial", exists_ok=True)
    # [END bigquery_dataframes_bqml_mf_explicit_create_dataset]

    # [START bigquery_dataframes_bqml_mf_explicit_upload_movielens]
    import io
    import zipfile

    import google.api_core.exceptions
    import requests

    try:
        # Check if you've already created the Movielens tables to avoid downloading
        # and uploading the dataset unnecessarily.
        bqclient.get_table("bqml_tutorial.ratings")
        bqclient.get_table("bqml_tutorial.movies")
    except google.api_core.exceptions.NotFound:
        # Download the https://grouplens.org/datasets/movielens/1m/ dataset.
        # The timeout keeps a stalled connection from hanging forever, and
        # raise_for_status() surfaces HTTP errors directly instead of letting
        # an error page fail later as a confusing BadZipFile.
        ml1m = requests.get(
            "http://files.grouplens.org/datasets/movielens/ml-1m.zip", timeout=60
        )
        ml1m.raise_for_status()
        ml1m_file = io.BytesIO(ml1m.content)
        ml1m_zip = zipfile.ZipFile(ml1m_file)

        # Upload the ratings data into the ratings table.
        with ml1m_zip.open("ml-1m/ratings.dat") as ratings_file:
            ratings_content = ratings_file.read()

        # The .dat files use "::" as the field separator; rewrite it to ","
        # so the file can be loaded as CSV.
        ratings_csv = io.BytesIO(ratings_content.replace(b"::", b","))
        ratings_config = google.cloud.bigquery.LoadJobConfig()
        ratings_config.source_format = "CSV"
        ratings_config.write_disposition = "WRITE_TRUNCATE"
        ratings_config.schema = [
            google.cloud.bigquery.SchemaField("user_id", "INT64"),
            google.cloud.bigquery.SchemaField("item_id", "INT64"),
            google.cloud.bigquery.SchemaField("rating", "FLOAT64"),
            google.cloud.bigquery.SchemaField("timestamp", "TIMESTAMP"),
        ]
        bqclient.load_table_from_file(
            ratings_csv, "bqml_tutorial.ratings", job_config=ratings_config
        ).result()

        # Upload the movie data into the movies table.
        with ml1m_zip.open("ml-1m/movies.dat") as movies_file:
            movies_content = movies_file.read()

        # Use "@" rather than "," as the delimiter for the movies file.
        movies_csv = io.BytesIO(movies_content.replace(b"::", b"@"))
        movies_config = google.cloud.bigquery.LoadJobConfig()
        movies_config.source_format = "CSV"
        movies_config.field_delimiter = "@"
        movies_config.write_disposition = "WRITE_TRUNCATE"
        movies_config.schema = [
            google.cloud.bigquery.SchemaField("movie_id", "INT64"),
            google.cloud.bigquery.SchemaField("movie_title", "STRING"),
            google.cloud.bigquery.SchemaField("genre", "STRING"),
        ]
        bqclient.load_table_from_file(
            movies_csv, "bqml_tutorial.movies", job_config=movies_config
        ).result()
    # [END bigquery_dataframes_bqml_mf_explicit_upload_movielens]

    # [START bigquery_dataframes_bqml_mf_explicit_create]
    from bigframes.ml import decomposition
    import bigframes.pandas as bpd

    # Load data from BigQuery
    bq_df = bpd.read_gbq(
        "bqml_tutorial.ratings", columns=("user_id", "item_id", "rating")
    )

    # Create the Matrix Factorization model
    model = decomposition.MatrixFactorization(
        num_factors=34,
        feedback_type="explicit",
        user_col="user_id",
        item_col="item_id",
        rating_col="rating",
        l2_reg=9.83,
    )
    model.fit(bq_df)
    model.to_gbq(
        your_model_id, replace=True  # For example: "bqml_tutorial.mf_explicit"
    )
    # [END bigquery_dataframes_bqml_mf_explicit_create]

    # [START bigquery_dataframes_bqml_mf_explicit_evaluate]
    # Evaluate the model using the score() function
    model.score(bq_df)
    # Output:
    # mean_absolute_error  mean_squared_error  mean_squared_log_error  median_absolute_error  r2_score  explained_variance
    #            0.485403            0.395052                0.025515               0.390573   0.68343             0.68343
    # [END bigquery_dataframes_bqml_mf_explicit_evaluate]

    # [START bigquery_dataframes_bqml_mf_explicit_recommend_df]
    # Use predict() to get the predicted rating for each movie for 5 users
    subset = bq_df[["user_id"]].head(5)
    predicted = model.predict(subset)
    print(predicted)
    # Output:
    #    predicted_rating  user_id  item_id  rating
    # 0          4.206146     4354      968     4.0
    # 1          4.853099     3622     3521     5.0
    # 2          2.679067     5543      920     2.0
    # 3          4.323458      445     3175     5.0
    # 4          3.476911     5535      235     4.0
    # [END bigquery_dataframes_bqml_mf_explicit_recommend_df]

    # [START bigquery_dataframes_bqml_mf_explicit_recommend_model]
    # Load movies
    movies = bpd.read_gbq("bqml_tutorial.movies")

    # Merge the movies df with the previously created predicted df
    merged_df = bpd.merge(predicted, movies, left_on="item_id", right_on="movie_id")

    # Separate users and predicted data, setting the index to 'movie_id'
    users = merged_df[["user_id", "movie_id"]].set_index("movie_id")

    # Take the predicted data and sort it in descending order by 'predicted_rating', setting the index to 'movie_id'
    sort_data = (
        merged_df[["movie_title", "genre", "predicted_rating", "movie_id"]]
        .sort_values(by="predicted_rating", ascending=False)
        .set_index("movie_id")
    )

    # re-merge the separated dfs by index
    merged_user = sort_data.join(users, how="outer")

    # group the users and set the user_id as the index
    # The chained calls return a new DataFrame, so the result must be
    # assigned; otherwise the un-grouped frame is what gets printed.
    merged_user = (
        merged_user.groupby("user_id").head(5).set_index("user_id").sort_index()
    )
    print(merged_user)
    # Output:
    #                          movie_title                   genre  predicted_rating
    # user_id
    # 1         Saving Private Ryan (1998)        Action|Drama|War           5.19326
    # 1                       Fargo (1996)    Crime|Drama|Thriller          4.996954
    # 1          Driving Miss Daisy (1989)                   Drama          4.983671
    # 1                     Ben-Hur (1959)  Action|Adventure|Drama          4.877622
    # 1            Schindler's List (1993)               Drama|War          4.802336
    # 2         Saving Private Ryan (1998)        Action|Drama|War           5.19326
    # 2                  Braveheart (1995)        Action|Drama|War          5.174145
    # 2                   Gladiator (2000)            Action|Drama          5.066372
    # 2              On Golden Pond (1981)                   Drama           5.01198
    # 2          Driving Miss Daisy (1989)                   Drama          4.983671
    # [END bigquery_dataframes_bqml_mf_explicit_recommend_model]
0 commit comments