|
22 | 22 | import torch |
23 | 23 |
|
24 | 24 | import PIL |
25 | | -from datasets import load_dataset |
26 | 25 | from diffusers import ( |
27 | 26 | AutoencoderKL, |
28 | 27 | DDIMPipeline, |
|
47 | 46 | VQModel, |
48 | 47 | ) |
49 | 48 | from diffusers.pipeline_utils import DiffusionPipeline |
50 | | -from diffusers.testing_utils import floats_tensor, slow, torch_device |
| 49 | +from diffusers.testing_utils import floats_tensor, load_image, slow, torch_device |
51 | 50 | from PIL import Image |
52 | 51 | from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer |
53 | 52 |
|
@@ -168,7 +167,7 @@ def dummy_text_encoder(self): |
168 | 167 | @property |
169 | 168 | def dummy_safety_checker(self): |
170 | 169 | def check(images, *args, **kwargs): |
171 | | - return images, False |
| 170 | + return images, [False] * len(images) |
172 | 171 |
|
173 | 172 | return check |
174 | 173 |
|
@@ -708,6 +707,13 @@ def tearDown(self): |
708 | 707 | gc.collect() |
709 | 708 | torch.cuda.empty_cache() |
710 | 709 |
|
| 710 | + @property |
| 711 | + def dummy_safety_checker(self): |
| 712 | + def check(images, *args, **kwargs): |
| 713 | + return images, [False] * len(images) |
| 714 | + |
| 715 | + return check |
| 716 | + |
711 | 717 | def test_from_pretrained_save_pretrained(self): |
712 | 718 | # 1. Load models |
713 | 719 | model = UNet2DModel( |
@@ -1139,144 +1145,164 @@ def test_stable_diffusion_memory_chunking(self): |
1139 | 1145 |
|
1140 | 1146 | @slow |
1141 | 1147 | @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") |
1142 | | - def test_stable_diffusion_img2img_pipeline(self): |
1143 | | - ds = load_dataset( |
1144 | | - "imagefolder", |
1145 | | - data_files={ |
1146 | | - "input": [ |
1147 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1148 | | - "/img2img/sketch-mountains-input.jpg" |
1149 | | - ], |
1150 | | - "output": [ |
1151 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1152 | | - "/img2img/fantasy_landscape.png" |
1153 | | - ], |
1154 | | - }, |
| 1148 | + def test_stable_diffusion_text2img_pipeline(self): |
| 1149 | + expected_image = load_image( |
| 1150 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1151 | + "/text2img/astronaut_riding_a_horse.png" |
1155 | 1152 | ) |
| 1153 | + expected_image = np.array(expected_image, dtype=np.float32) / 255.0 |
| 1154 | + |
| 1155 | + model_id = "CompVis/stable-diffusion-v1-4" |
| 1156 | + pipe = StableDiffusionPipeline.from_pretrained( |
| 1157 | + model_id, |
| 1158 | + safety_checker=self.dummy_safety_checker, |
| 1159 | + use_auth_token=True, |
| 1160 | + ) |
| 1161 | + pipe.to(torch_device) |
| 1162 | + pipe.set_progress_bar_config(disable=None) |
| 1163 | + pipe.enable_attention_slicing() |
| 1164 | + |
| 1165 | + prompt = "astronaut riding a horse" |
| 1166 | + |
| 1167 | + generator = torch.Generator(device=torch_device).manual_seed(0) |
| 1168 | + output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np") |
| 1169 | + image = output.images[0] |
1156 | 1170 |
|
1157 | | - init_image = ds["input"]["image"][0].resize((768, 512)) |
1158 | | - output_image = ds["output"]["image"][0].resize((768, 512)) |
| 1171 | + assert image.shape == (512, 512, 3) |
| 1172 | + assert np.abs(expected_image - image).max() < 1e-2 |
| 1173 | + |
| 1174 | + @slow |
| 1175 | + @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") |
| 1176 | + def test_stable_diffusion_img2img_pipeline(self): |
| 1177 | + init_image = load_image( |
| 1178 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1179 | + "/img2img/sketch-mountains-input.jpg" |
| 1180 | + ) |
| 1181 | + expected_image = load_image( |
| 1182 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1183 | + "/img2img/fantasy_landscape.png" |
| 1184 | + ) |
| 1185 | + init_image = init_image.resize((768, 512)) |
| 1186 | + expected_image = np.array(expected_image, dtype=np.float32) / 255.0 |
1159 | 1187 |
|
1160 | 1188 | model_id = "CompVis/stable-diffusion-v1-4" |
1161 | 1189 | pipe = StableDiffusionImg2ImgPipeline.from_pretrained( |
1162 | 1190 | model_id, |
| 1191 | + safety_checker=self.dummy_safety_checker, |
1163 | 1192 | use_auth_token=True, |
1164 | 1193 | ) |
1165 | 1194 | pipe.to(torch_device) |
1166 | | - pipe.enable_attention_slicing() |
1167 | 1195 | pipe.set_progress_bar_config(disable=None) |
| 1196 | + pipe.enable_attention_slicing() |
1168 | 1197 |
|
1169 | 1198 | prompt = "A fantasy landscape, trending on artstation" |
1170 | 1199 |
|
1171 | 1200 | generator = torch.Generator(device=torch_device).manual_seed(0) |
1172 | | - with torch.autocast("cuda"): |
1173 | | - output = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5, generator=generator) |
| 1201 | + output = pipe( |
| 1202 | + prompt=prompt, |
| 1203 | + init_image=init_image, |
| 1204 | + strength=0.75, |
| 1205 | + guidance_scale=7.5, |
| 1206 | + generator=generator, |
| 1207 | + output_type="np", |
| 1208 | + ) |
1174 | 1209 | image = output.images[0] |
1175 | 1210 |
|
1176 | | - expected_array = np.array(output_image) / 255.0 |
1177 | | - sampled_array = np.array(image) / 255.0 |
| 1211 | + Image.fromarray((image * 255).round().astype("uint8")).save("fantasy_landscape.png") |
1178 | 1212 |
|
1179 | | - assert sampled_array.shape == (512, 768, 3) |
1180 | | - assert np.max(np.abs(sampled_array - expected_array)) < 1e-4 |
| 1213 | + assert image.shape == (512, 768, 3) |
| 1214 | + assert np.abs(expected_image - image).max() < 1e-2 |
1181 | 1215 |
|
1182 | 1216 | @slow |
1183 | 1217 | @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") |
1184 | 1218 | def test_stable_diffusion_img2img_pipeline_k_lms(self): |
1185 | | - ds = load_dataset( |
1186 | | - "imagefolder", |
1187 | | - data_files={ |
1188 | | - "input": [ |
1189 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1190 | | - "/img2img/sketch-mountains-input.jpg" |
1191 | | - ], |
1192 | | - "output": [ |
1193 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1194 | | - "/img2img/fantasy_landscape_k_lms.png" |
1195 | | - ], |
1196 | | - }, |
| 1219 | + init_image = load_image( |
| 1220 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1221 | + "/img2img/sketch-mountains-input.jpg" |
1197 | 1222 | ) |
1198 | | - |
1199 | | - init_image = ds["input"]["image"][0].resize((768, 512)) |
1200 | | - output_image = ds["output"]["image"][0].resize((768, 512)) |
| 1223 | + expected_image = load_image( |
| 1224 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1225 | + "/img2img/fantasy_landscape_k_lms.png" |
| 1226 | + ) |
| 1227 | + init_image = init_image.resize((768, 512)) |
| 1228 | + expected_image = np.array(expected_image, dtype=np.float32) / 255.0 |
1201 | 1229 |
|
1202 | 1230 | lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") |
1203 | 1231 |
|
1204 | 1232 | model_id = "CompVis/stable-diffusion-v1-4" |
1205 | 1233 | pipe = StableDiffusionImg2ImgPipeline.from_pretrained( |
1206 | 1234 | model_id, |
1207 | 1235 | scheduler=lms, |
| 1236 | + safety_checker=self.dummy_safety_checker, |
1208 | 1237 | use_auth_token=True, |
1209 | 1238 | ) |
1210 | | - pipe.enable_attention_slicing() |
1211 | 1239 | pipe.to(torch_device) |
1212 | 1240 | pipe.set_progress_bar_config(disable=None) |
| 1241 | + pipe.enable_attention_slicing() |
1213 | 1242 |
|
1214 | 1243 | prompt = "A fantasy landscape, trending on artstation" |
1215 | 1244 |
|
1216 | 1245 | generator = torch.Generator(device=torch_device).manual_seed(0) |
1217 | | - with torch.autocast("cuda"): |
1218 | | - output = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5, generator=generator) |
| 1246 | + output = pipe( |
| 1247 | + prompt=prompt, |
| 1248 | + init_image=init_image, |
| 1249 | + strength=0.75, |
| 1250 | + guidance_scale=7.5, |
| 1251 | + generator=generator, |
| 1252 | + output_type="np", |
| 1253 | + ) |
1219 | 1254 | image = output.images[0] |
1220 | 1255 |
|
1221 | | - expected_array = np.array(output_image) / 255.0 |
1222 | | - sampled_array = np.array(image) / 255.0 |
| 1256 | + Image.fromarray((image * 255).round().astype("uint8")).save("fantasy_landscape_k_lms.png") |
1223 | 1257 |
|
1224 | | - assert sampled_array.shape == (512, 768, 3) |
1225 | | - assert np.max(np.abs(sampled_array - expected_array)) < 1e-4 |
| 1258 | + assert image.shape == (512, 768, 3) |
| 1259 | + assert np.abs(expected_image - image).max() < 1e-2 |
1226 | 1260 |
|
1227 | 1261 | @slow |
1228 | 1262 | @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") |
1229 | 1263 | def test_stable_diffusion_inpaint_pipeline(self): |
1230 | | - ds = load_dataset( |
1231 | | - "imagefolder", |
1232 | | - data_files={ |
1233 | | - "input": [ |
1234 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1235 | | - "/in_paint/overture-creations-5sI6fQgYIuo.png" |
1236 | | - ], |
1237 | | - "mask": [ |
1238 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1239 | | - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" |
1240 | | - ], |
1241 | | - "output": [ |
1242 | | - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
1243 | | - "/in_paint/red_cat_sitting_on_a_parking_bench.png" |
1244 | | - ], |
1245 | | - }, |
| 1264 | + init_image = load_image( |
| 1265 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1266 | + "/in_paint/overture-creations-5sI6fQgYIuo.png" |
1246 | 1267 | ) |
1247 | | - |
1248 | | - init_image = ds["input"]["image"][0].resize((768, 512)) |
1249 | | - mask_image = ds["mask"]["image"][0].resize((768, 512)) |
1250 | | - output_image = ds["output"]["image"][0].resize((768, 512)) |
| 1268 | + mask_image = load_image( |
| 1269 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1270 | + "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" |
| 1271 | + ) |
| 1272 | + expected_image = load_image( |
| 1273 | + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" |
| 1274 | + "/in_paint/red_cat_sitting_on_a_park_bench.png" |
| 1275 | + ) |
| 1276 | + expected_image = np.array(expected_image, dtype=np.float32) / 255.0 |
1251 | 1277 |
|
1252 | 1278 | model_id = "CompVis/stable-diffusion-v1-4" |
1253 | 1279 | pipe = StableDiffusionInpaintPipeline.from_pretrained( |
1254 | 1280 | model_id, |
| 1281 | + safety_checker=self.dummy_safety_checker, |
1255 | 1282 | use_auth_token=True, |
1256 | 1283 | ) |
1257 | 1284 | pipe.to(torch_device) |
1258 | | - pipe.enable_attention_slicing() |
1259 | 1285 | pipe.set_progress_bar_config(disable=None) |
| 1286 | + pipe.enable_attention_slicing() |
1260 | 1287 |
|
1261 | | - prompt = "A red cat sitting on a parking bench" |
| 1288 | + prompt = "A red cat sitting on a park bench" |
1262 | 1289 |
|
1263 | 1290 | generator = torch.Generator(device=torch_device).manual_seed(0) |
1264 | | - with torch.autocast("cuda"): |
1265 | | - output = pipe( |
1266 | | - prompt=prompt, |
1267 | | - init_image=init_image, |
1268 | | - mask_image=mask_image, |
1269 | | - strength=0.75, |
1270 | | - guidance_scale=7.5, |
1271 | | - generator=generator, |
1272 | | - ) |
| 1291 | + output = pipe( |
| 1292 | + prompt=prompt, |
| 1293 | + init_image=init_image, |
| 1294 | + mask_image=mask_image, |
| 1295 | + strength=0.75, |
| 1296 | + guidance_scale=7.5, |
| 1297 | + generator=generator, |
| 1298 | + output_type="np", |
| 1299 | + ) |
1273 | 1300 | image = output.images[0] |
1274 | 1301 |
|
1275 | | - expected_array = np.array(output_image) / 255.0 |
1276 | | - sampled_array = np.array(image) / 255.0 |
| 1302 | + Image.fromarray((image * 255).round().astype("uint8")).save("red_cat_sitting_on_a_park_bench.png") |
1277 | 1303 |
|
1278 | | - assert sampled_array.shape == (512, 768, 3) |
1279 | | - assert np.max(np.abs(sampled_array - expected_array)) < 1e-3 |
| 1304 | + assert image.shape == (512, 512, 3) |
| 1305 | + assert np.abs(expected_image - image).max() < 1e-2 |
1280 | 1306 |
|
1281 | 1307 | @slow |
1282 | 1308 | def test_stable_diffusion_onnx(self): |
|
0 commit comments