Changes from all commits (43 commits)
de01058  performance optimization (hadipash, Oct 9, 2024)
37fd542  linting (hadipash, Oct 9, 2024)
8d6d25b  fix (hadipash, Oct 10, 2024)
39a15fc  linting (hadipash, Oct 10, 2024)
4a83b34  Merge branch 'master' into perf_op (hadipash, Feb 26, 2025)
5c01279  Merge branch 'master' into perf_op (hadipash, Mar 7, 2025)
8bffac6  Merge branch 'master' into perf_op (hadipash, Mar 17, 2025)
74f91c6  add test_hunyuan_vae.py (HaoyangLee, Mar 25, 2025)
2be0fd3  init infer_vae_v2.py (HaoyangLee, Mar 27, 2025)
7945126  update infer_vae_v2.py (HaoyangLee, Mar 27, 2025)
ede97cb  add shell script (HaoyangLee, Mar 27, 2025)
ff6a0f8  update infer_vae_v2.py (HaoyangLee, Mar 27, 2025)
f863571  update infer_vae_v2.py (HaoyangLee, Mar 27, 2025)
2941d85  add cond for infer_vae_v2.py (HaoyangLee, Mar 28, 2025)
be85b48  update (HaoyangLee, Mar 28, 2025)
33ddb2c  encoder -> encode (HaoyangLee, Apr 9, 2025)
b08b6a3  save latent mean and std instead of latent itself (HaoyangLee, Apr 9, 2025)
3c9177d  add num_frames (HaoyangLee, Apr 9, 2025)
f16cb53  set vae_micro_batch_size to None by default (HaoyangLee, Apr 9, 2025)
09c900e  add test_hunyuan_vae.py (HaoyangLee, Mar 25, 2025)
ce83fcf  add training (hadipash, Mar 26, 2025)
1def1d3  add sequence parallel (hadipash, Mar 31, 2025)
fd88e28  fix graph mode (hadipash, Apr 9, 2025)
0b961dc  update configs (hadipash, Apr 10, 2025)
77b2bc1  add SP for inference (hadipash, Apr 10, 2025)
72ce279  fix weights loading, add lazy inline (hadipash, Apr 11, 2025)
54b3537  Merge remote-tracking branch 'Haoyang/opensora2_0' into osv2.0_train (hadipash, Apr 15, 2025)
371f221  update mindone to installable (hadipash, Apr 17, 2025)
ac1f0e8  refactor inference pipelines (hadipash, Apr 17, 2025)
60c6fb4  support independent text embeddings drop (hadipash, Apr 22, 2025)
cd34937  add I/V2V training support (hadipash, Apr 22, 2025)
843e2bc  fix VAE (hadipash, Apr 29, 2025)
41507f0  refactor VAE inference (hadipash, May 13, 2025)
eab5d10  cache RoPE (hadipash, May 15, 2025)
c3703fd  generate VAE embeddings during training in Pynative (hadipash, May 23, 2025)
b58be68  add TP to VAE (hadipash, May 27, 2025)
8ca7322  update docs and add scripts (hadipash, Jun 11, 2025)
7e5b4ef  drop custom `repeat_interleave` (hadipash, Jun 12, 2025)
86866d4  Merge pull request #15 from hadipash/perf_op (hadipash, Jun 12, 2025)
e9cd952  small fix (hadipash, Jun 19, 2025)
ea055b1  small fix (hadipash, Jun 24, 2025)
714d948  Merge branch 'master' into osv2.0_train (hadipash, Jul 14, 2025)
8c9436d  fix repo linting (hadipash, Jul 14, 2025)
47 changes: 36 additions & 11 deletions examples/opensora_hpcai/README.md
@@ -193,7 +193,13 @@ Other useful documents and links are listed below.
install [CANN 8.0.0.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1)
as recommended by the official installation website.

-2. Install requirements
+2. Install MindONE
+
+```shell
+pip install -e .[training]
+```
+
+3. Install requirements

```shell
pip install -r requirements.txt
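# Optional: verify the installation (assumes the `mindspore` package installed in the steps above)
python -c "import mindspore; mindspore.run_check()"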
@@ -366,16 +372,16 @@ First, you will need to generate text embeddings with:
```shell
# CLIP-Large
-TRANSFORMERS_OFFLINE=1 python scripts/v2.0/text_embedding.py \
-    --model.from_pretrained="DeepFloyd/t5-v1_1-xxl" \
-    --model.max_length=512 \
-    --prompts_file=YOUR_PROMPTS.txt \
-    --output_path=assets/texts/t5_512
-# T5
TRANSFORMERS_OFFLINE=1 python scripts/v2.0/text_embedding.py \
    --model.from_pretrained="openai/clip-vit-large-patch14" \
    --model.max_length=77 \
    --prompts_file=YOUR_PROMPTS.txt \
    --output_path=assets/texts/clip_77
+# T5
+TRANSFORMERS_OFFLINE=1 python scripts/v2.0/text_embedding.py \
+    --model.from_pretrained="DeepFloyd/t5-v1_1-xxl" \
+    --model.max_length=512 \
+    --prompts_file=YOUR_PROMPTS.txt \
+    --output_path=assets/texts/t5_512
```

Repeat the same for negative prompts.
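
The negative-prompt embeddings used below (`assets/texts/t5_512_neg` and `assets/texts/clip_77_neg`) can be produced with the same script; a sketch, assuming the negative prompts live in a separate text file (the file name here is illustrative):

```shell
# Negative prompts: same models and max lengths, separate output folders
TRANSFORMERS_OFFLINE=1 python scripts/v2.0/text_embedding.py \
    --model.from_pretrained="openai/clip-vit-large-patch14" \
    --model.max_length=77 \
    --prompts_file=YOUR_NEG_PROMPTS.txt \
    --output_path=assets/texts/clip_77_neg
TRANSFORMERS_OFFLINE=1 python scripts/v2.0/text_embedding.py \
    --model.from_pretrained="DeepFloyd/t5-v1_1-xxl" \
    --model.max_length=512 \
    --prompts_file=YOUR_NEG_PROMPTS.txt \
    --output_path=assets/texts/t5_512_neg
```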
@@ -384,10 +390,10 @@ Then, you can generate videos by running the following command:

```shell
python scripts/v2.0/inference_v2.py --config=configs/opensora-v2-0/inference/256px.yaml \
-text_emb.t5_dir=assets/texts/t5_512 \
-text_emb.neg_t5_dir=assets/texts/t5_512_neg \
-text_emb.clip_dir=assets/texts/clip_77 \
-text_emb.neg_clip_dir=assets/texts/clip_77_neg
+prompts.t5_dir=assets/texts/t5_512 \
+prompts.neg_t5_dir=assets/texts/t5_512_neg \
+prompts.clip_dir=assets/texts/clip_77 \
+prompts.neg_clip_dir=assets/texts/clip_77_neg
```
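
If the inference script accepts dotted-key overrides for other config sections the way it does for `prompts.*` (an assumption, not verified here), sampling and saving options could be adjusted inline as well; illustrative values only:

```shell
python scripts/v2.0/inference_v2.py --config=configs/opensora-v2-0/inference/256px.yaml \
    prompts.t5_dir=assets/texts/t5_512 \
    prompts.clip_dir=assets/texts/clip_77 \
    sampling_option.motion_score=4 \
    saving_option.fps=24
```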

#### Inference Performance
@@ -616,8 +622,26 @@ video_embed_folder

## Training

### Open-Sora 2.0

Once the data is prepared in a CSV file, training can be started by running the appropriate bash script from the
`scripts/v2.0/run` directory (see the illustrative command below).
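
For example, a launch might look like the following; the script name is hypothetical, since the exact files under `scripts/v2.0/run` are not listed in this diff:

```shell
# Illustrative only: substitute the stage script that matches your config
bash scripts/v2.0/run/run_train_stage1.sh
```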

#### Training Performance

| Model name | Cards | Batch size | Mode | JIT level | Method | Resolution | Frames | Sequence parallel | ZeRO stage | VAE cache | Text cache | Step time (s) | Recipe |
|:----------:|:-----:|:----------:|:--------------------------:|:---------:|:------:|:----------:|:------:|:-----------------:|:----------:|:---------:|:----------:|:-------------:|:------------------------------------------------------:|
| 11B | 8 | 1 | Graph | O1 | t2v | 256x256 | 129 | - | 3 | Yes | Yes | 4.58 | [yaml](configs/opensora-v2-0/train/stage1_latent.yaml) |
| 11B | 8 | 1 | Pynative | - | t2v | 256x256 | 129 | - | 3 | Yes | Yes | 6.57 | [yaml](configs/opensora-v2-0/train/stage1_latent.yaml) |
| 11B | 8 | 2 | MMDiT Graph + VAE Pynative | O1 | t2v | 256x256 | 129 | - | 3 | No | Yes | 11.8 | [yaml](configs/opensora-v2-0/train/stage1.yaml) |
| 11B | 8 | 1 | Graph | O1 | t2v | 768x768 | 129 | 8 | 3 | Yes | Yes | 16.5 | [yaml](configs/opensora-v2-0/train/stage2_latent.yaml) |
| 11B | 8 | 1 | Pynative | - | t2v | 768x768 | 129 | 8 | 3 | Yes | Yes | 18.9 | [yaml](configs/opensora-v2-0/train/stage2_latent.yaml) |

### Open-Sora 1.2

<details>
<summary>Instructions</summary>

Once you have prepared the data in a CSV file, you may run the following commands to launch training on a single card.

```shell
@@ -675,6 +699,7 @@ More details on the bucket configuration can be found in [Multi-resolution Train

The instructions for launching the dynamic training task are similar to those in the previous section. An example running script is `scripts/run/run_train_os1.2_stage2.sh`.

</details>

### Open-Sora 1.1

@@ -0,0 +1,8 @@
from_pretrained: hpcai-tech/Open-Sora-v2/hunyuan_vae.safetensors
in_channels: 3
out_channels: 3
layers_per_block: 2
latent_channels: 16
use_spatial_tiling: True
use_temporal_tiling: False
dtype: bf16
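
This appears to be the shared VAE config that the other configs below pull in as `ae: ../ae/hunyuan_vae.yaml`. Its `from_pretrained` entry points at `hpcai-tech/Open-Sora-v2/hunyuan_vae.safetensors`; if the weights need to be fetched manually, one option (an assumption, not taken from this diff) is the Hugging Face CLI:

```shell
# Downloads hunyuan_vae.safetensors from the hpcai-tech/Open-Sora-v2 repo; the target directory is illustrative
huggingface-cli download hpcai-tech/Open-Sora-v2 hunyuan_vae.safetensors --local-dir ./checkpoints
```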
@@ -22,17 +22,9 @@ model:
cond_embed: True
dtype: bf16

-ae:
-from_pretrained: hpcai-tech/Open-Sora-v2/hunyuan_vae.safetensors
-in_channels: 3
-out_channels: 3
-layers_per_block: 2
-latent_channels: 16
-use_spatial_tiling: True
-use_temporal_tiling: False
-dtype: bf16
+ae: ../ae/hunyuan_vae.yaml

-text_emb:
+prompts:
prompts:
neg_prompts:
t5_dir:
@@ -64,12 +56,11 @@ sampling_option:
image_osci: True # enable image guidance oscillation
scale_temporal_osci: True
method: i2v
motion_score: "4" # motion score for video generation
motion_score: 4 # motion score for video generation
batch_size: 1
cond_type: "t2v"
cond_type: t2v

saving_option:
-output_path: ../../../samples # save directory
fps: 24 # fps for video generation and saving

# T2I. TODO: separate config
@@ -119,6 +110,6 @@ sampling_option_t2i:
image_osci: True # enable image guidance oscillation
scale_temporal_osci: True
method: distill
motion_score: "4" # motion score for video generation
motion_score: 4 # motion score for video generation
batch_size: 1
cond_type: "t2v"
cond_type: t2v
@@ -2,6 +2,8 @@ env:
mode: 1
debug: False

+enable_sequence_parallel: True

model:
from_pretrained: hpcai-tech/Open-Sora-v2/Open_Sora_v2.safetensors
guidance_embed: False
@@ -22,17 +24,9 @@ model:
cond_embed: True
dtype: bf16

-ae:
-from_pretrained: hpcai-tech/Open-Sora-v2/hunyuan_vae.safetensors
-in_channels: 3
-out_channels: 3
-layers_per_block: 2
-latent_channels: 16
-use_spatial_tiling: True
-use_temporal_tiling: False
-dtype: bf16
+ae: ../ae/hunyuan_vae.yaml

-text_emb:
+prompts:
prompts:
neg_prompts:
t5_dir:
@@ -64,12 +58,11 @@ sampling_option:
image_osci: True # enable image guidance oscillation
scale_temporal_osci: True
method: i2v
motion_score: "4" # motion score for video generation
motion_score: 4 # motion score for video generation
batch_size: 1
cond_type: "t2v"
cond_type: t2v

saving_option:
-output_path: ../../../samples # save directory
fps: 24 # fps for video generation and saving

# T2I. TODO: separate config
@@ -119,6 +112,6 @@ sampling_option_t2i:
image_osci: True # enable image guidance oscillation
scale_temporal_osci: True
method: distill
motion_score: "4" # motion score for video generation
motion_score: 4 # motion score for video generation
batch_size: 1
cond_type: "t2v"
cond_type: t2v
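
Since this variant sets `enable_sequence_parallel: True`, inference is expected to run across multiple devices. A minimal launch sketch using MindSpore's `msrun` launcher, assuming 8 devices; the config path, log directory, and embedding folders are placeholders:

```shell
msrun --worker_num=8 --local_worker_num=8 --log_dir=logs/sp_inference --join=True \
    python scripts/v2.0/inference_v2.py --config=PATH_TO_SP_CONFIG.yaml \
    prompts.t5_dir=assets/texts/t5_512 \
    prompts.clip_dir=assets/texts/clip_77
```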
106 changes: 106 additions & 0 deletions examples/opensora_hpcai/configs/opensora-v2-0/train/image.yaml
@@ -0,0 +1,106 @@
env:
mode: 0
jit_level: O1
max_device_memory: 59GB
seed: 42
distributed: False
debug: False

model:
from_pretrained:
guidance_embed: False
fused_qkv: False
use_liger_rope: True
# model architecture
in_channels: 64
vec_in_dim: 768
context_in_dim: 4096
hidden_size: 3072
mlp_ratio: 4.0
num_heads: 24
depth: 19
depth_single_blocks: 38
axes_dim: [ 16, 56, 56 ]
theta: 10_000
qkv_bias: True
cond_embed: False
recompute_every_nth_block: 1
dtype: bf16

ae: ../ae/hunyuan_vae.yaml

dataset:
v2_pipeline: True
sample_n_frames: 5
csv_path: CSV_PATH
video_folder: VIDEO_FOLDER
text_emb_folder:
t5: UL2_FOLDER
clip: BYT5_FOLDER
empty_text_emb:
t5: EMPTY_TEXT_EMB
clip: EMPTY_TEXT_EMB
text_drop_prob:
t5: 0.31622777
clip: 0.31622777
vae_scale_factor: 0.476986
vae_shift_factor: 0
apply_transforms_dataset: True
output_columns: [ "video", "video_ids", "t5_caption", "txt_ids", "clip_caption", "shift_alpha" ]

bucket_config:
init_args:
bucket_config:
256px:
1: [ 1.0, 50 ]
768px:
1: [ 0.5, 11 ]
1024px:
1: [ 0.5, 7 ]

dataloader:
shuffle: True
num_workers_dataset: 4

train:
pipeline:
is_causal_vae: True

sequence_parallel:
shards: 1 # 1 == no SP

options:
steps: 20000

lr_scheduler:
name: constant
lr: 1e-5
warmup_steps: 0

optimizer:
name: adamw_bf16
eps: 1e-15
betas: [ 0.9, 0.999 ]
weight_decay: 0

loss_scaler:
class_path: mindspore.nn.FixedLossScaleUpdateCell # or DynamicLossScaleUpdateCell in FP16
init_args:
loss_scale_value: 1

settings:
zero_stage: 2
gradient_accumulation_steps: 1
clip_grad: True
clip_norm: 1.0

save:
ckpt_save_policy: latest_k
ckpt_save_interval: 500
ckpt_max_keep: 10
log_interval: 1
save_ema_only: False
record_lr: False

save:
output_path: ../../../output/image # the path is relative to this config
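
A possible way to launch training with this config is sketched below; the entry-point script, the use of `msrun`, and the dotted-key overrides for the placeholder fields (`CSV_PATH`, `VIDEO_FOLDER`, the embedding folders) are assumptions rather than commands taken from this diff:

```shell
# Hypothetical entry point and overrides; prefer the provided scripts under scripts/v2.0/run
msrun --worker_num=8 --local_worker_num=8 --log_dir=logs/train_image --join=True \
    python scripts/v2.0/train_v2.py --config=configs/opensora-v2-0/train/image.yaml \
    env.distributed=True \
    dataset.csv_path=/path/to/data.csv \
    dataset.video_folder=/path/to/videos \
    dataset.text_emb_folder.t5=/path/to/t5_embeddings \
    dataset.text_emb_folder.clip=/path/to/clip_embeddings
```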