@@ -86,16 +86,6 @@ def end_profiling(device):
     # Scale for classifier-free guidance
     guidance_scale = torch.tensor(args.guidance_scale).to(torch.float32)

-    # Handle out of range seeds.
-    uint32_info = np.iinfo(np.uint32)
-    uint32_min, uint32_max = uint32_info.min, uint32_info.max
-    seed = args.seed
-    if seed < uint32_min or seed >= uint32_max:
-        seed = randint(uint32_min, uint32_max)
-    generator = torch.manual_seed(
-        seed
-    )  # Seed generator to create the inital latent noise
-
     # TODO: Add support for batch_size > 1.
     batch_size = len(prompt)
     if batch_size != 1:
@@ -144,139 +134,157 @@ def end_profiling(device):
144134 "stabilityai/stable-diffusion-2-1-base" ,
145135 subfolder = "scheduler" ,
146136 )
+    for run in range(args.runs):
+        # Handle out of range seeds.
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        seed = args.seed
+        if run >= 1 or seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+        generator = torch.manual_seed(
+            seed
+        )  # Seed generator to create the inital latent noise
+
+        # create a random initial latent.
+        latents = torch.randn(
+            (batch_size, 4, height // 8, width // 8),
+            generator=generator,
+            dtype=torch.float32,
+        ).to(dtype)
+        if run == 0:
+            # Warmup phase to improve performance.
+            if args.warmup_count >= 1:
+                vae_warmup_input = torch.clone(latents).detach().numpy()
+                clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
+                for i in range(args.warmup_count):
+                    vae("forward", (vae_warmup_input,))
+                    clip("forward", (clip_warmup_input,))
+
+        start = time.time()
+        if run == 0:
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=args.max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = tokenizer(
+                neg_prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input = torch.cat(
+                [uncond_input.input_ids, text_input.input_ids]
+            )

-    # create a random initial latent.
-    latents = torch.randn(
-        (batch_size, 4, height // 8, width // 8),
-        generator=generator,
-        dtype=torch.float32,
-    ).to(dtype)
-    # Warmup phase to improve performance.
-    if args.warmup_count >= 1:
-        vae_warmup_input = torch.clone(latents).detach().numpy()
-        clip_warmup_input = torch.randint(1, 2, (2, args.max_length))
-        for i in range(args.warmup_count):
-            vae("forward", (vae_warmup_input,))
-            clip("forward", (clip_warmup_input,))
-
-    start = time.time()
-
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=args.max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-        neg_prompt,
-        padding="max_length",
-        max_length=max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
-    text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])
-
-    clip_inf_start = time.time()
-    text_embeddings = clip("forward", (text_input,))
-    clip_inf_end = time.time()
-    text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-    text_embeddings_numpy = text_embeddings.detach().numpy()
-
-    scheduler.set_timesteps(num_inference_steps)
-    scheduler.is_scale_input_called = True
-
-    latents = latents * scheduler.init_noise_sigma
-
-    avg_ms = 0
-    for i, t in tqdm(enumerate(scheduler.timesteps), disable=args.hide_steps):
-        step_start = time.time()
-        if not args.hide_steps:
-            print(f"i = {i} t = {t} ", end="")
-        timestep = torch.tensor([t]).to(dtype).detach().numpy()
-        latent_model_input = scheduler.scale_model_input(latents, t)
-        if cpu_scheduling:
-            latent_model_input = latent_model_input.detach().numpy()
-
-        profile_device = start_profiling(file_path="unet.rdc")
-
-        noise_pred = unet(
-            "forward",
-            (
-                latent_model_input,
-                timestep,
-                text_embeddings_numpy,
-                guidance_scale,
-            ),
-            send_to_host=False,
-        )
-
-        end_profiling(profile_device)
+        clip_inf_start = time.time()
+        text_embeddings = clip("forward", (text_input,))
+        clip_inf_end = time.time()
+        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+
+        scheduler.set_timesteps(num_inference_steps)
+        scheduler.is_scale_input_called = True
+
+        latents = latents * scheduler.init_noise_sigma
+
+        avg_ms = 0
+        for i, t in tqdm(
+            enumerate(scheduler.timesteps), disable=args.hide_steps
+        ):
+            step_start = time.time()
+            if not args.hide_steps:
+                print(f"i = {i} t = {t} ", end="")
+            timestep = torch.tensor([t]).to(dtype).detach().numpy()
+            latent_model_input = scheduler.scale_model_input(latents, t)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            profile_device = start_profiling(file_path="unet.rdc")
+
+            noise_pred = unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ),
+                send_to_host=False,
+            )

+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = scheduler.step(noise_pred, t, latents).prev_sample
+            else:
+                latents = scheduler.step(noise_pred, t, latents)
+            step_time = time.time() - step_start
+            avg_ms += step_time
+            step_ms = int((step_time) * 1000)
+            if not args.hide_steps:
+                print(f" ({step_ms}ms)")
+
+        # scale and decode the image latents with vae
+        if args.use_base_vae:
+            latents = 1 / 0.18215 * latents
+            latents_numpy = latents
         if cpu_scheduling:
-            noise_pred = torch.from_numpy(noise_pred.to_host())
-            latents = scheduler.step(noise_pred, t, latents).prev_sample
+            latents_numpy = latents.detach().numpy()
+        profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = vae("forward", (latents_numpy,))
+        vae_end = time.time()
+        end_profiling(profile_device)
+        if args.use_base_vae:
+            image = torch.from_numpy(images)
+            image = (image.detach().cpu() * 255.0).numpy()
+            images = image.round()
+        end_time = time.time()
+
+        avg_ms = 1000 * avg_ms / args.steps
+        clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
+        vae_inf_time = (vae_end - vae_start) * 1000
+        total_time = end_time - start
+
+        print(f"\nStats for run {run}:")
+        print(f"Average step time: {avg_ms}ms/it")
+        print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
+        print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
+        print(f"\nTotal image generation time: {total_time}sec")
+
+        transform = T.ToPILImage()
+        pil_images = [
+            transform(image)
+            for image in torch.from_numpy(images).to(torch.uint8)
+        ]
+
+        if args.output_dir is not None:
+            output_path = Path(args.output_dir)
+            output_path.mkdir(parents=True, exist_ok=True)
         else:
-            latents = scheduler.step(noise_pred, t, latents)
-        step_time = time.time() - step_start
-        avg_ms += step_time
-        step_ms = int((step_time) * 1000)
-        if not args.hide_steps:
-            print(f" ({step_ms}ms)")
-
-    # scale and decode the image latents with vae
-    if args.use_base_vae:
-        latents = 1 / 0.18215 * latents
-        latents_numpy = latents
-    if cpu_scheduling:
-        latents_numpy = latents.detach().numpy()
-    profile_device = start_profiling(file_path="vae.rdc")
-    vae_start = time.time()
-    images = vae("forward", (latents_numpy,))
-    vae_end = time.time()
-    end_profiling(profile_device)
-    if args.use_base_vae:
-        image = torch.from_numpy(images)
-        image = (image.detach().cpu() * 255.0).numpy()
-        images = image.round()
-    end_time = time.time()
-
-    avg_ms = 1000 * avg_ms / args.steps
-    clip_inf_time = (clip_inf_end - clip_inf_start) * 1000
-    vae_inf_time = (vae_end - vae_start) * 1000
-    total_time = end_time - start
-    print(f"\nAverage step time: {avg_ms}ms/it")
-    print(f"Clip Inference time (ms) = {clip_inf_time:.3f}")
-    print(f"VAE Inference time (ms): {vae_inf_time:.3f}")
-    print(f"\nTotal image generation time: {total_time}sec")
-
-    transform = T.ToPILImage()
-    pil_images = [
-        transform(image) for image in torch.from_numpy(images).to(torch.uint8)
-    ]
-
-    if args.output_dir is not None:
-        output_path = Path(args.output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-    else:
-        output_path = Path.cwd()
-    disk_space_check(output_path, lim=5)
-    for i in range(batch_size):
-        json_store = {
-            "prompt": args.prompts[i],
-            "negative prompt": args.negative_prompts[i],
-            "seed": args.seed,
-            "variant": args.variant,
-            "precision": args.precision,
-            "steps": args.steps,
-            "guidance_scale": args.guidance_scale,
-            "scheduler": args.scheduler,
-        }
-        prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[i][:15])
-        img_name = f"{prompt_slice}_{args.seed}_{i}_{dt.now().strftime('%y%m%d_%H%M%S')}"
-        pil_images[i].save(
-            output_path / f"{img_name}.jpg", quality=95, subsampling=0
-        )
-        with open(output_path / f"{img_name}.json", "w") as f:
-            f.write(json.dumps(json_store, indent=4))
+            output_path = Path.cwd()
+        disk_space_check(output_path, lim=5)
+        for i in range(batch_size):
+            json_store = {
+                "prompt": args.prompts[i],
+                "negative prompt": args.negative_prompts[i],
+                "seed": args.seed,
+                "variant": args.variant,
+                "precision": args.precision,
+                "steps": args.steps,
+                "guidance_scale": args.guidance_scale,
+                "scheduler": args.scheduler,
+            }
+            prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[i][:15])
+            img_name = f"{prompt_slice}_{args.seed}_{run}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+            pil_images[i].save(
+                output_path / f"{img_name}.jpg", quality=95, subsampling=0
+            )
+            with open(output_path / f"{img_name}.json", "w") as f:
+                f.write(json.dumps(json_store, indent=4))
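
For reference, below is a minimal standalone sketch of the per-run seed policy this diff introduces: run 0 keeps a valid user-supplied --seed, while every later run (or an out-of-range seed) falls back to a fresh random uint32 seed, so repeated runs produce different images. The helper name select_seed is hypothetical and only illustrates the condition used in the loop; the bounds mirror np.iinfo(np.uint32) from the script.

import numpy as np
from random import randint

# uint32 bounds, as in the script's np.iinfo(np.uint32) check.
UINT32_MIN, UINT32_MAX = np.iinfo(np.uint32).min, np.iinfo(np.uint32).max


def select_seed(requested_seed: int, run: int) -> int:
    # Run 0 reuses a valid user-supplied seed; later runs (or invalid seeds)
    # draw a random one so each generated image differs.
    if run >= 1 or requested_seed < UINT32_MIN or requested_seed >= UINT32_MAX:
        return randint(UINT32_MIN, UINT32_MAX)
    return requested_seed


# Example: run 0 reproduces seed 42, runs 1 and 2 draw new seeds.
for run in range(3):
    print(run, select_seed(42, run))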