diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 83f2eb88f88..c3ec8db0a19 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -300,6 +300,9 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     """
     Constructs a Faster R-CNN model with a ResNet-50-FPN backbone.
 
+    Reference: `"Faster R-CNN: Towards Real-Time Object Detection with
+    Region Proposal Networks" <https://arxiv.org/abs/1506.01497>`_.
+
     The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
     image, and should be in ``0-1`` range. Different images can have different sizes.
 
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index fd9a980b97d..79df3b450c4 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -278,6 +278,8 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     """
     Constructs a Keypoint R-CNN model with a ResNet-50-FPN backbone.
 
+    Reference: `"Mask R-CNN" <https://arxiv.org/abs/1703.06870>`_ (the paper also covers keypoint detection).
+
     The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
     image, and should be in ``0-1`` range. Different images can have different sizes.
 
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index ad8f356ad69..06b36d573ab 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -271,6 +271,8 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     """
     Constructs a Mask R-CNN model with a ResNet-50-FPN backbone.
 
+    Reference: `"Mask R-CNN" <https://arxiv.org/abs/1703.06870>`_.
+
     The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
     image, and should be in ``0-1`` range. Different images can have different sizes.
 
diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 4dd95285dbc..c6e301c268c 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -569,6 +569,8 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
     """
     Constructs a RetinaNet model with a ResNet-50-FPN backbone.
 
+    Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
+
     The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
     image, and should be in ``0-1`` range. Different images can have different sizes.