Skip to content

Commit 24b37ae

Browse files
rey-esp and tswast authored
docs: add snippets for Matrix Factorization tutorials (#1630)
* docs: add matrix_factorization snippets * incomplete mf snippets * prep implicit * near complete tutorial * implicit create * add doc note * complete explicit tutorial * remove implicit snippets * Update samples/snippets/mf_explicit_model_test.py * add snippets to create dataset and movielens tables * correct the region tags * correct more region tags * Update samples/snippets/mf_explicit_model_test.py Co-authored-by: Tim Sweña (Swast) <[email protected]> * Update samples/snippets/mf_explicit_model_test.py Co-authored-by: Tim Sweña (Swast) <[email protected]> * update evaluate section --------- Co-authored-by: Tim Sweña (Swast) <[email protected]>
1 parent f661a5f commit 24b37ae

File tree

1 file changed

+162
-0
lines changed

1 file changed

+162
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def test_explicit_matrix_factorization(random_model_id: str) -> None:
    """Run the explicit-feedback matrix factorization tutorial end to end.

    The body doubles as documentation: each ``[START ...]`` / ``[END ...]``
    pair delimits a snippet, so statements are kept in tutorial order and
    imports are placed inside the snippet that needs them.

    Steps performed:
      1. Create the ``bqml_tutorial`` dataset if it does not exist.
      2. Download the MovieLens 1M archive and load the ``ratings`` and
         ``movies`` tables, unless both tables are already present.
      3. Fit a ``MatrixFactorization`` model on the ratings and save it.
      4. Score the model, predict ratings, and print the top five predicted
         movies per user.

    Args:
        random_model_id: Destination model ID passed to ``model.to_gbq``
            (presumably supplied by a pytest fixture; confirm in conftest).
    """
    your_model_id = random_model_id

    # [START bigquery_dataframes_bqml_mf_explicit_create_dataset]
    import google.cloud.bigquery

    bqclient = google.cloud.bigquery.Client()
    bqclient.create_dataset("bqml_tutorial", exists_ok=True)
    # [END bigquery_dataframes_bqml_mf_explicit_create_dataset]

    # [START bigquery_dataframes_bqml_mf_explicit_upload_movielens]
    import io
    import zipfile

    import google.api_core.exceptions
    import requests

    try:
        # Check if you've already created the Movielens tables to avoid downloading
        # and uploading the dataset unnecessarily.
        bqclient.get_table("bqml_tutorial.ratings")
        bqclient.get_table("bqml_tutorial.movies")
    except google.api_core.exceptions.NotFound:
        # Download the https://grouplens.org/datasets/movielens/1m/ dataset.
        ml1m = requests.get("http://files.grouplens.org/datasets/movielens/ml-1m.zip")
        # Fail fast on an HTTP error instead of handing an error page to
        # zipfile, which would surface as a confusing BadZipFile.
        ml1m.raise_for_status()
        ml1m_file = io.BytesIO(ml1m.content)
        ml1m_zip = zipfile.ZipFile(ml1m_file)

        # Upload the ratings data into the ratings table.
        with ml1m_zip.open("ml-1m/ratings.dat") as ratings_file:
            ratings_content = ratings_file.read()

        # The source file separates fields with "::"; rewrite it as plain CSV.
        ratings_csv = io.BytesIO(ratings_content.replace(b"::", b","))
        ratings_config = google.cloud.bigquery.LoadJobConfig()
        ratings_config.source_format = "CSV"
        ratings_config.write_disposition = "WRITE_TRUNCATE"
        ratings_config.schema = [
            google.cloud.bigquery.SchemaField("user_id", "INT64"),
            google.cloud.bigquery.SchemaField("item_id", "INT64"),
            google.cloud.bigquery.SchemaField("rating", "FLOAT64"),
            google.cloud.bigquery.SchemaField("timestamp", "TIMESTAMP"),
        ]
        bqclient.load_table_from_file(
            ratings_csv, "bqml_tutorial.ratings", job_config=ratings_config
        ).result()

        # Upload the movie data into the movies table.
        with ml1m_zip.open("ml-1m/movies.dat") as movies_file:
            movies_content = movies_file.read()

        # Use "@" rather than "," as the delimiter here (presumably because
        # movie titles can themselves contain commas).
        movies_csv = io.BytesIO(movies_content.replace(b"::", b"@"))
        movies_config = google.cloud.bigquery.LoadJobConfig()
        movies_config.source_format = "CSV"
        movies_config.field_delimiter = "@"
        movies_config.write_disposition = "WRITE_TRUNCATE"
        movies_config.schema = [
            google.cloud.bigquery.SchemaField("movie_id", "INT64"),
            google.cloud.bigquery.SchemaField("movie_title", "STRING"),
            google.cloud.bigquery.SchemaField("genre", "STRING"),
        ]
        bqclient.load_table_from_file(
            movies_csv, "bqml_tutorial.movies", job_config=movies_config
        ).result()
    # [END bigquery_dataframes_bqml_mf_explicit_upload_movielens]

    # [START bigquery_dataframes_bqml_mf_explicit_create]
    from bigframes.ml import decomposition
    import bigframes.pandas as bpd

    # Load data from BigQuery
    bq_df = bpd.read_gbq(
        "bqml_tutorial.ratings", columns=("user_id", "item_id", "rating")
    )

    # Create the Matrix Factorization model
    model = decomposition.MatrixFactorization(
        num_factors=34,
        feedback_type="explicit",
        user_col="user_id",
        item_col="item_id",
        rating_col="rating",
        l2_reg=9.83,
    )
    model.fit(bq_df)
    model.to_gbq(
        your_model_id, replace=True  # For example: "bqml_tutorial.mf_explicit"
    )
    # [END bigquery_dataframes_bqml_mf_explicit_create]
    # [START bigquery_dataframes_bqml_mf_explicit_evaluate]
    # Evaluate the model using the score() function
    model.score(bq_df)
    # Output:
    # mean_absolute_error  mean_squared_error  mean_squared_log_error  median_absolute_error  r2_score  explained_variance
    # 0.485403             0.395052            0.025515                0.390573               0.68343   0.68343
    # [END bigquery_dataframes_bqml_mf_explicit_evaluate]
    # [START bigquery_dataframes_bqml_mf_explicit_recommend_df]
    # Use predict() to get the predicted rating for each movie for 5 users
    subset = bq_df[["user_id"]].head(5)
    predicted = model.predict(subset)
    print(predicted)
    # Output:
    #   predicted_rating  user_id  item_id  rating
    # 0         4.206146     4354      968     4.0
    # 1         4.853099     3622     3521     5.0
    # 2         2.679067     5543      920     2.0
    # 3         4.323458      445     3175     5.0
    # 4         3.476911     5535      235     4.0
    # [END bigquery_dataframes_bqml_mf_explicit_recommend_df]
    # [START bigquery_dataframes_bqml_mf_explicit_recommend_model]
    # Load movies
    movies = bpd.read_gbq("bqml_tutorial.movies")

    # Merge the movies df with the previously created predicted df
    merged_df = bpd.merge(predicted, movies, left_on="item_id", right_on="movie_id")

    # Separate users and predicted data, setting the index to 'movie_id'
    users = merged_df[["user_id", "movie_id"]].set_index("movie_id")

    # Take the predicted data and sort it in descending order by 'predicted_rating', setting the index to 'movie_id'
    sort_data = (
        merged_df[["movie_title", "genre", "predicted_rating", "movie_id"]]
        .sort_values(by="predicted_rating", ascending=False)
        .set_index("movie_id")
    )

    # re-merge the separated dfs by index
    merged_user = sort_data.join(users, how="outer")

    # group the users and set the user_id as the index
    # The chain returns a new DataFrame, so the result must be assigned;
    # otherwise the print below shows the un-grouped frame.
    merged_user = (
        merged_user.groupby("user_id").head(5).set_index("user_id").sort_index()
    )
    print(merged_user)
    # Output:
    #                          movie_title                   genre  predicted_rating
    # user_id
    # 1         Saving Private Ryan (1998)        Action|Drama|War           5.19326
    # 1                       Fargo (1996)    Crime|Drama|Thriller          4.996954
    # 1          Driving Miss Daisy (1989)                   Drama          4.983671
    # 1                     Ben-Hur (1959)  Action|Adventure|Drama          4.877622
    # 1            Schindler's List (1993)               Drama|War          4.802336
    # 2         Saving Private Ryan (1998)        Action|Drama|War           5.19326
    # 2                  Braveheart (1995)        Action|Drama|War          5.174145
    # 2                   Gladiator (2000)            Action|Drama          5.066372
    # 2              On Golden Pond (1981)                   Drama           5.01198
    # 2          Driving Miss Daisy (1989)                   Drama          4.983671
    # [END bigquery_dataframes_bqml_mf_explicit_recommend_model]

0 commit comments

Comments
 (0)