diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 8fffc853df49..ed4adc9bb414 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -237,19 +237,19 @@
   - local: api/models/consistency_decoder_vae
     title: ConsistencyDecoderVAE
   - local: api/models/transformer2d
-    title: Transformer2D
+    title: Transformer2DModel
   - local: api/models/pixart_transformer2d
-    title: PixArtTransformer2D
+    title: PixArtTransformer2DModel
   - local: api/models/dit_transformer2d
-    title: DiTTransformer2D
+    title: DiTTransformer2DModel
   - local: api/models/hunyuan_transformer_2d
     title: HunyuanDiT2DModel
   - local: api/models/transformer_temporal
-    title: Transformer Temporal
+    title: TransformerTemporalModel
   - local: api/models/prior_transformer
-    title: Prior Transformer
+    title: PriorTransformer
   - local: api/models/controlnet
-    title: ControlNet
+    title: ControlNetModel
   title: Models
   isExpanded: false
 - sections:
diff --git a/docs/source/en/api/models/controlnet.md b/docs/source/en/api/models/controlnet.md
index fe0c9ca1d8a6..b57620e1e414 100644
--- a/docs/source/en/api/models/controlnet.md
+++ b/docs/source/en/api/models/controlnet.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# ControlNet
+# ControlNetModel
 
 The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
 
diff --git a/docs/source/en/api/models/dit_transformer2d.md b/docs/source/en/api/models/dit_transformer2d.md
index 1bf48e3da984..afac62d53cb4 100644
--- a/docs/source/en/api/models/dit_transformer2d.md
+++ b/docs/source/en/api/models/dit_transformer2d.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# DiTTransformer2D
+# DiTTransformer2DModel
 
 A Transformer model for image-like data from [DiT](https://huggingface.co/papers/2212.09748).
 
diff --git a/docs/source/en/api/models/pixart_transformer2d.md b/docs/source/en/api/models/pixart_transformer2d.md
index 982122207a9c..5ddfabc618e5 100644
--- a/docs/source/en/api/models/pixart_transformer2d.md
+++ b/docs/source/en/api/models/pixart_transformer2d.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# PixArtTransformer2D
+# PixArtTransformer2DModel
 
 A Transformer model for image-like data from [PixArt-Alpha](https://huggingface.co/papers/2310.00426) and [PixArt-Sigma](https://huggingface.co/papers/2403.04692).
 
diff --git a/docs/source/en/api/models/prior_transformer.md b/docs/source/en/api/models/prior_transformer.md
index 21b3918571de..3d4e3a81782c 100644
--- a/docs/source/en/api/models/prior_transformer.md
+++ b/docs/source/en/api/models/prior_transformer.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Prior Transformer
+# PriorTransformer
 
 The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process.
 
diff --git a/docs/source/en/api/models/transformer2d.md b/docs/source/en/api/models/transformer2d.md
index a56a1379a27d..6440aa3dd1ba 100644
--- a/docs/source/en/api/models/transformer2d.md
+++ b/docs/source/en/api/models/transformer2d.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Transformer2D
+# Transformer2DModel
 
 A Transformer model for image-like data from [CompVis](https://huggingface.co/CompVis) that is based on the [Vision Transformer](https://huggingface.co/papers/2010.11929) introduced by Dosovitskiy et al. The [`Transformer2DModel`] accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs.
 
diff --git a/docs/source/en/api/models/transformer_temporal.md b/docs/source/en/api/models/transformer_temporal.md
index ff5a497c03f5..02d075dea3f3 100644
--- a/docs/source/en/api/models/transformer_temporal.md
+++ b/docs/source/en/api/models/transformer_temporal.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Transformer Temporal
+# TransformerTemporalModel
 
 A Transformer model for video-like data.
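
Each retitled page now matches the class name that `diffusers` exports, so the doc title can be copied directly into an import. A minimal sketch of that correspondence (it assumes a `diffusers` release that includes all of these classes; the repo id below is illustrative, not part of this change):

```python
# Every retitled doc page now mirrors an importable class name.
from diffusers import (
    ControlNetModel,
    DiTTransformer2DModel,
    PixArtTransformer2DModel,
    PriorTransformer,
    Transformer2DModel,
    TransformerTemporalModel,
)

# For example, load a pretrained ControlNet through the class named
# in its doc title (repo id is an illustrative assumption).
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
print(type(controlnet).__name__)  # -> "ControlNetModel"
```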