diff --git a/configs/animal_detection/config_animal_detect_image_folder_template.yaml b/configs/animal_detection/config_animal_detect_image_folder_template.yaml new file mode 100644 index 0000000..566b3e0 --- /dev/null +++ b/configs/animal_detection/config_animal_detect_image_folder_template.yaml @@ -0,0 +1,152 @@ +# ============================================================================= +# Configuration File for Batch Animal Detection from Image Folders +# ============================================================================= +# This configuration is optimized for animal detection on image files directly +# from a directory structure using MegaDetector models. +# ----------------------------------------------------------------------------- + +# --------------------------- +# Model Configuration +# --------------------------- +# MegaDetector model for animal detection +model: + weights: MDV6-yolov10-e-1280.pt # MegaDetector model weights + # https://microsoft.github.io/CameraTraps/model_zoo/megadetector/ + +# --------------------------- +# Detection Parameters +# --------------------------- +confidence_threshold: 0.2 # Minimum confidence score for animal detections + # MegaDetector typically uses 0.2 as default threshold + # Lower values = more detections (including false positives) + # Higher values = fewer, more confident detections + +image_size: 1280 # Input image size for the model (square format) + # Common values: 640, 1024, 1280 + # Larger sizes may improve detection accuracy but increase processing time + +# --------------------------- +# DataLoader Configurations +# --------------------------- +batch_size: 16 # Number of images per batch (adjust based on GPU memory) + # Animal detection (especially MegaDetector) can be memory-intensive + # Start with smaller batch sizes and increase if memory allows +num_workers: 20 # Number of worker processes for data loading +prefetch_factor: 8 # Number of batches prefetched by each worker + +# --------------------------- +# Image Processing Settings +# --------------------------- +validate_images: false # Set to true to validate all images can be opened with PIL + # Slower startup but catches corrupted files + +# How to generate unique IDs from image file paths +uuid_mode: filename # Options: + # - "filename": image001.jpg + # - "relative": subfolder/image001.jpg + # - "fullpath": /full/path/to/image001.jpg + # - "hash": MD5 hash of full path + +# --------------------------- +# Distributed Processing +# --------------------------- +evenly_distribute: true # Distribute files based on size for load balancing +stagger: false # Stagger worker start times to reduce file system load + +# --------------------------- +# Output Configurations +# --------------------------- +max_rows_per_file: 100000 # Maximum number of detection results per output file + # Animal detection results can be large due to multiple detections per image +out_prefix: animal_detection_results # Prefix for output files + +# ============================================================================= +# USAGE EXAMPLE: +# ============================================================================= +# python animal_detect.py /path/to/images /path/to/output --input_type images --config config_animal_detect_image_folder_template.yaml +# ============================================================================= + +# ============================================================================= +# IMAGE DIRECTORY REQUIREMENTS: +# 
============================================================================= +# Your image directory can have any structure: +# +# Flat structure: +# /images/ +# ├── image001.jpg +# ├── image002.png +# └── image003.jpeg +# +# Nested structure: +# /images/ +# ├── category1/ +# │ ├── img1.jpg +# │ └── img2.png +# └── category2/ +# ├── img3.jpg +# └── img4.png +# +# Supported formats: .jpg, .jpeg, .png, .bmp, .tif, .tiff, .webp +# All images are automatically converted to RGB mode for processing. +# +# UUID GENERATION MODES: +# - filename: Good for flat directories with unique filenames +# - relative: Good for nested directories where path info is important +# - fullpath: Good when you need absolute path traceability +# - hash: Good for very long paths or when you want anonymized IDs +# ============================================================================= + +# ============================================================================= +# OUTPUT FORMAT: +# ============================================================================= +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (based on uuid_mode) +# - max_detection_score: Maximum confidence score across all detections (0.0 if no animals detected) +# - num_detections: Total number of detections above threshold +# - detections: JSON string containing detailed detection information +# +# Each detection includes: +# - bbox: Absolute pixel coordinates [x1, y1, x2, y2] +# - bbox_normalized: Normalized coordinates [0-1] +# - confidence: Detection confidence score +# - class_id: Numeric class ID (0=animal, 1=person, 2=vehicle for MegaDetector) +# - class_name: Human-readable class name +# +# Files are saved in: {output_dir}/detections/rank_{rank}/ +# Example output: +# animal_detection_results_rank_0_0.parquet +# animal_detection_results_rank_0_1.parquet +# ... 
+# ============================================================================= + +# ============================================================================= +# PERFORMANCE TUNING GUIDELINES: +# ============================================================================= +# +# GPU Memory Optimization: +# - Reduce batch_size if running out of GPU memory +# - MegaDetector can be memory-intensive, especially at high resolutions +# - Consider using smaller image_size if memory is limited +# +# CPU/I-O Optimization: +# - Increase num_workers for faster data loading & prevent OOM crashes +# - Increase prefetch_factor for better pipeline utilization +# +# Distributed Processing: +# - Use evenly_distribute=true for better load balancing +# - Set stagger=true if experiencing file system bottlenecks +# +# Detection Quality vs Speed: +# - MegaDetector confidence_threshold of 0.2 is typically optimal based on repo documentation +# - Lower thresholds may find more animals but increase false positives +# - Higher image_size improves accuracy but slows processing +# - Choose appropriate model based on accuracy vs speed needs: +# * MegaDetectorV6-Ultralytics-YoloV10-Extra: Most accurate, best for wildlife +# * YOLOv8 variants: General purpose object detection +# +# Model-Specific Notes: +# - MegaDetector is specifically trained for wildlife camera trap images +# - It detects animals and people with high accuracy +# - Works well on images from outdoor/natural settings +# - May not perform as well on indoor or urban animal images +# ============================================================================= diff --git a/configs/animal_detection/config_animal_detect_parquet_template.yaml b/configs/animal_detection/config_animal_detect_parquet_template.yaml new file mode 100644 index 0000000..d541220 --- /dev/null +++ b/configs/animal_detection/config_animal_detect_parquet_template.yaml @@ -0,0 +1,191 @@ +# ============================================================================= +# Configuration File for Batch Animal Detection from Parquet Files +# ============================================================================= +# This configuration is optimized for animal detection on Parquet files containing +# encoded image data with metadata using MegaDetector models. 
+# ----------------------------------------------------------------------------- + +# --------------------------- +# Model Configuration +# --------------------------- +# MegaDetector model for animal detection +model: + weights: md_v5a.0.0.pt # MegaDetector model weights + # Options: + # - md_v5a.0.0.pt (MegaDetector v5a, recommended) + # - md_v5b.0.0.pt (MegaDetector v5b, alternative) + # - yolov8n.pt (YOLOv8 nano, general purpose) + # - yolov8s.pt (YOLOv8 small, balanced) + # - yolov8m.pt (YOLOv8 medium, more accurate) + # - yolov8l.pt (YOLOv8 large, most accurate) + # Custom trained models are also supported + # For MegaDetector models, see: + # https://github.com/microsoft/CameraTraps + +# --------------------------- +# Detection Parameters +# --------------------------- +confidence_threshold: 0.2 # Minimum confidence score for animal detections + # MegaDetector typically uses 0.2 as default threshold + # Lower values = more detections (including false positives) + # Higher values = fewer, more confident detections + +image_size: 1280 # Input image size for the model (square format) + # MegaDetector v5 typically uses 1280 + # Common values: 640, 1024, 1280 + # Larger sizes may improve detection accuracy but increase processing time + +# --------------------------- +# DataLoader Configurations +# --------------------------- +batch_size: 8 # Number of images per batch (adjust based on GPU memory) + # Animal detection (especially MegaDetector) can be memory-intensive + # Start with smaller batch sizes and increase if memory allows +num_workers: 24 # Number of worker processes for data loading +prefetch_factor: 8 # Number of batches prefetched by each worker + +# --------------------------- +# Parquet-Specific Settings +# --------------------------- +read_batch_size: 64 # Number of rows to read from Parquet at a time + # Smaller than embedding tasks due to larger detection output per image + +# Columns to read from Parquet files (must exist in your data) +read_columns: + - uuid # [REQUIRED] Unique identifier for each image + - image # [REQUIRED] Encoded image bytes (JPEG, PNG, etc.) 
+ - original_size # [OPTIONAL] Original image dimensions + - resized_size # [OPTIONAL] Resized image dimensions + +# --------------------------- +# Distributed Processing +# --------------------------- +evenly_distribute: true # Distribute files based on size for load balancing +stagger: false # Stagger worker start times to reduce I/O congestion + +# --------------------------- +# Output Configurations +# --------------------------- +max_rows_per_file: 5000 # Maximum number of detection results per output file + # Animal detection results can be large due to multiple detections per image +out_prefix: animal_detection_results # Prefix for output files + +# ============================================================================= +# USAGE EXAMPLES: +# ============================================================================= +# +# For Parquet files: +# python animal_detect.py /path/to/parquet_dir /path/to/output --input_type parquet --config config_animal_detect_parquet_template.yaml +# +# With file list: +# python animal_detect.py /path/to/parquet_dir /path/to/output --input_type parquet --file_list files.txt --config config_animal_detect_parquet_template.yaml +# ============================================================================= + +# ============================================================================= +# PARQUET DATA REQUIREMENTS: +# ============================================================================= +# Your Parquet files must contain: +# 1. 'uuid' column: Unique string identifier for each image +# 2. 'image' column: Image data encoded as bytes (from PIL Image.save() to BytesIO) +# 3. Optional metadata columns as specified in read_columns +# +# Example of creating compatible Parquet data: +# ```python +# import io +# from PIL import Image +# import pandas as pd +# import pyarrow.parquet as pq +# +# # Encode image to bytes +# img = Image.open('image.jpg') +# img_bytes = io.BytesIO() +# img.save(img_bytes, format='JPEG') +# img_bytes = img_bytes.getvalue() +# +# # Create DataFrame +# df = pd.DataFrame({ +# 'uuid': ['img_001'], +# 'image': [img_bytes], +# 'original_size': [(1024, 768)], +# 'resized_size': [(1280, 1280)] +# }) +# +# # Save to Parquet +# df.to_parquet('images.parquet') +# ``` +# ============================================================================= + +# ============================================================================= +# OUTPUT FORMAT: +# ============================================================================= +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (from input Parquet) +# - max_detection_score: Maximum confidence score across all detections (0.0 if no animals detected) +# - num_detections: Total number of detections above threshold +# - detections: JSON string containing detailed detection information +# +# Each detection in the JSON includes: +# - bbox: Absolute pixel coordinates [x1, y1, x2, y2] +# - bbox_normalized: Normalized coordinates [0-1] +# - confidence: Detection confidence score +# - class_id: Numeric class ID (0=animal, 1=person, 2=vehicle for MegaDetector) +# - class_name: Human-readable class name +# +# Files are saved in: {output_dir}/detections/rank_{rank}/ +# Example output: +# animal_detection_results_rank_0_0.parquet +# animal_detection_results_rank_0_1.parquet +# ... 
+# +# Example detection JSON structure: +# [ +# { +# "bbox": [120.5, 80.2, 340.8, 290.1], +# "bbox_normalized": [0.118, 0.078, 0.333, 0.284], +# "confidence": 0.85, +# "class_id": 0, +# "class_name": "animal" +# }, +# { +# "bbox": [450.0, 200.0, 600.0, 400.0], +# "bbox_normalized": [0.440, 0.195, 0.586, 0.391], +# "confidence": 0.72, +# "class_id": 1, +# "class_name": "person" +# } +# ] +# ============================================================================= + +# ============================================================================= +# PERFORMANCE TUNING GUIDELINES: +# ============================================================================= +# +# GPU Memory Optimization: +# - Reduce batch_size if running out of GPU memory +# - MegaDetector can be memory-intensive, especially at high resolutions +# - Consider using smaller image_size if memory is limited +# +# CPU/I-O Optimization: +# - Increase num_workers for faster data loading (but watch CPU usage) +# - Increase prefetch_factor for better pipeline utilization +# - Adjust read_batch_size for optimal Parquet I/O performance +# +# Distributed Processing: +# - Use evenly_distribute=true for better load balancing +# - Set stagger=true if experiencing I/O bottlenecks +# +# Detection Quality vs Speed: +# - MegaDetector confidence_threshold of 0.2 is typically optimal +# - Lower thresholds may find more animals but increase false positives +# - Higher image_size improves accuracy but slows processing +# - Choose appropriate model based on accuracy vs speed needs: +# * md_v5a.0.0.pt: Best for wildlife/camera trap images +# * YOLOv8 variants: General purpose object detection +# +# Model-Specific Notes: +# - MegaDetector is specifically trained for wildlife camera trap images +# - It detects animals, people, and vehicles with high accuracy +# - Works well on images from outdoor/natural settings +# - May not perform as well on indoor or urban animal images +# - Provides both detection and rough classification capabilities +# ============================================================================= diff --git a/configs/config_embed_image_folder_template.yaml b/configs/embed/config_embed_image_folder_template.yaml similarity index 100% rename from configs/config_embed_image_folder_template.yaml rename to configs/embed/config_embed_image_folder_template.yaml diff --git a/configs/config_embed_parquet_template.yaml b/configs/embed/config_embed_parquet_template.yaml similarity index 100% rename from configs/config_embed_parquet_template.yaml rename to configs/embed/config_embed_parquet_template.yaml diff --git a/configs/face_detection/config_face_detect_image_folder_template.yaml b/configs/face_detection/config_face_detect_image_folder_template.yaml new file mode 100644 index 0000000..73351a5 --- /dev/null +++ b/configs/face_detection/config_face_detect_image_folder_template.yaml @@ -0,0 +1,142 @@ +# ============================================================================= +# Configuration File for Batch Face Detection from Image Folders +# ============================================================================= +# This configuration is optimized for face detection on image files directly +# from a directory structure using YOLO models. 
+# ----------------------------------------------------------------------------- + +# --------------------------- +# Model Configuration +# --------------------------- +# YOLO model for face detection +model: + weights: yolov8n-face.pt # YOLO face detection model weights + # Options: + # - yolov8n-face.pt (fastest, least accurate) + # - yolov8s-face.pt (balanced) + # - yolov8m-face.pt (more accurate) + # - yolov8l-face.pt (most accurate, slowest) + # Custom trained models are also supported + # For more models, see: + # https://github.com/akanametov/yolo-face?tab=readme-ov-file#models + +# --------------------------- +# Detection Parameters +# --------------------------- +confidence_threshold: 0.5 # Minimum confidence score for face detections + # Lower values = more detections (including false positives) + # Higher values = fewer, more confident detections + +image_size: 1024 # Input image size for YOLO model (square format) + # Larger sizes may improve detection accuracy but increase processing time + +# --------------------------- +# Image Processing Settings +# --------------------------- +image_size: 1024 # Input image size for YOLO model (square) + # Common values: 640, 1024, 1280 + # Larger sizes = more accurate but slower + +# --------------------------- +# DataLoader Configurations +# --------------------------- +batch_size: 16 # Number of images per batch (adjust based on GPU memory) +num_workers: 28 # Number of worker processes for data loading +prefetch_factor: 16 # Number of batches prefetched by each worker + +# --------------------------- +# Image Processing Settings +# --------------------------- +validate_images: false # Set to true to validate all images can be opened with PIL + # Slower startup but catches corrupted files + +# How to generate unique IDs from image file paths +uuid_mode: filename # Options: + # - "filename": image001.jpg + # - "relative": subfolder/image001.jpg + # - "fullpath": /full/path/to/image001.jpg + # - "hash": MD5 hash of full path + +# --------------------------- +# Distributed Processing +# --------------------------- +evenly_distribute: true # Distribute files based on size for load balancing +stagger: false # Stagger worker start times to reduce file system load + +# --------------------------- +# Output Configurations +# --------------------------- +max_rows_per_file: 10000 # Maximum number of detection results per output file +out_prefix: face_detection_results # Prefix for output files + +# ============================================================================= +# USAGE EXAMPLE: +# ============================================================================= +# python face_detect.py /path/to/images /path/to/output --input_type images --config config_face_detect_image_folder_template.yaml +# ============================================================================= + +# ============================================================================= +# IMAGE DIRECTORY REQUIREMENTS: +# ============================================================================= +# Your image directory can have any structure: +# +# Flat structure: +# /images/ +# ├── image001.jpg +# ├── image002.png +# └── image003.jpeg +# +# Nested structure: +# /images/ +# ├── category1/ +# │ ├── img1.jpg +# │ └── img2.png +# └── category2/ +# ├── img3.jpg +# └── img4.png +# +# Supported formats: .jpg, .jpeg, .png, .bmp, .tif, .tiff, .webp +# All images are automatically converted to RGB mode for processing. 
+# +# UUID GENERATION MODES: +# - filename: Good for flat directories with unique filenames +# - relative: Good for nested directories where path info is important +# - fullpath: Good when you need absolute path traceability +# - hash: Good for very long paths or when you want anonymized IDs +# ============================================================================= + +# ============================================================================= +# OUTPUT FORMAT: +# ============================================================================= +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (based on uuid_mode) +# - detection_score: Maximum confidence score for face detection (0.0 if no faces detected) +# +# Files are saved in: {output_dir}/detections/rank_{rank}/ +# Example output: +# face_detection_results_rank_0_0.parquet +# face_detection_results_rank_0_1.parquet +# ... +# ============================================================================= + +# ============================================================================= +# PERFORMANCE TUNING GUIDELINES: +# ============================================================================= +# +# GPU Memory Optimization: +# - Reduce batch_size if running out of GPU memory +# - Face detection is typically less memory-intensive than embedding +# +# CPU/I-O Optimization: +# - Increase num_workers for faster data loading (but watch CPU usage) +# - Increase prefetch_factor for better pipeline utilization +# +# Distributed Processing: +# - Use evenly_distribute=true for better load balancing +# - Set stagger=true if experiencing file system bottlenecks +# +# Detection Quality vs Speed: +# - Lower confidence_threshold = more detections but more false positives +# - Higher confidence_threshold = fewer but more reliable detections +# - Choose appropriate YOLO model size based on accuracy vs speed needs +# ============================================================================= diff --git a/configs/face_detection/config_face_detect_parquet_template.yaml b/configs/face_detection/config_face_detect_parquet_template.yaml new file mode 100644 index 0000000..cbd5b1f --- /dev/null +++ b/configs/face_detection/config_face_detect_parquet_template.yaml @@ -0,0 +1,151 @@ +# ============================================================================= +# Configuration File for Batch Face Detection from Parquet Files +# ============================================================================= +# This configuration is optimized for face detection on Parquet files containing +# encoded image data with metadata using YOLO models. 
+# ----------------------------------------------------------------------------- + +# --------------------------- +# Model Configuration +# --------------------------- +# YOLO model for face detection +model: + weights: yolov8n-face.pt # YOLO face detection model weights + # Options: + # - yolov8n-face.pt (fastest, least accurate) + # - yolov8s-face.pt (balanced) + # - yolov8m-face.pt (more accurate) + # - yolov8l-face.pt (most accurate, slowest) + # Custom trained models are also supported + # For more models, see: + # https://github.com/akanametov/yolo-face?tab=readme-ov-file#models +# --------------------------- +# Detection Parameters +# --------------------------- +confidence_threshold: 0.5 # Minimum confidence score for face detections + # Lower values = more detections (including false positives) + # Higher values = fewer, more confident detections + +image_size: 1024 # Input image size for YOLO model (square format) + # Larger sizes may improve detection accuracy but increase processing time + +# --------------------------- +# Image Processing Settings +# --------------------------- +image_size: 1024 # Input image size for YOLO model (square) + # Common values: 640, 1024, 1280 + # Larger sizes = more accurate but slower + +# --------------------------- +# DataLoader Configurations +# --------------------------- +batch_size: 16 # Number of images per batch (adjust based on GPU memory) +num_workers: 28 # Number of worker processes for data loading +prefetch_factor: 16 # Number of batches prefetched by each worker + +# --------------------------- +# Parquet-Specific Settings +# --------------------------- +read_batch_size: 128 # Number of rows to read from Parquet at a time + # Larger values = more memory usage but potentially faster I/O + +# Columns to read from Parquet files (must exist in your data) +read_columns: + - uuid # [REQUIRED] Unique identifier for each image + - image # [REQUIRED] Encoded image bytes (JPEG, PNG, etc.) + - original_size # [OPTIONAL] Original image dimensions + - resized_size # [OPTIONAL] Resized image dimensions + +# --------------------------- +# Distributed Processing +# --------------------------- +evenly_distribute: true # Distribute files based on size for load balancing +stagger: false # Stagger worker start times to reduce I/O congestion + +# --------------------------- +# Output Configurations +# --------------------------- +max_rows_per_file: 10000 # Maximum number of detection results per output file +out_prefix: face_detection_results # Prefix for output files + +# ============================================================================= +# USAGE EXAMPLES: +# ============================================================================= +# +# For Parquet files: +# python face_detect.py /path/to/parquet_dir /path/to/output --input_type parquet --config config_face_detect_parquet_template.yaml +# +# With file list: +# python face_detect.py /path/to/parquet_dir /path/to/output --input_type parquet --file_list files.txt --config config_face_detect_parquet_template.yaml +# ============================================================================= + +# ============================================================================= +# PARQUET DATA REQUIREMENTS: +# ============================================================================= +# Your Parquet files must contain: +# 1. 'uuid' column: Unique string identifier for each image +# 2. 'image' column: Image data encoded as bytes (from PIL Image.save() to BytesIO) +# 3. 
Optional metadata columns as specified in read_columns +# +# Example of creating compatible Parquet data: +# ```python +# import io +# from PIL import Image +# import pandas as pd +# import pyarrow.parquet as pq +# +# # Encode image to bytes +# img = Image.open('image.jpg') +# img_bytes = io.BytesIO() +# img.save(img_bytes, format='JPEG') +# img_bytes = img_bytes.getvalue() +# +# # Create DataFrame +# df = pd.DataFrame({ +# 'uuid': ['img_001'], +# 'image': [img_bytes], +# 'original_size': [(1024, 768)], +# 'resized_size': [(640, 640)] +# }) +# +# # Save to Parquet +# df.to_parquet('images.parquet') +# ``` +# ============================================================================= + +# ============================================================================= +# OUTPUT FORMAT: +# ============================================================================= +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (from input Parquet) +# - detection_score: Maximum confidence score for face detection (0.0 if no faces detected) +# +# Files are saved in: {output_dir}/detections/rank_{rank}/ +# Example output: +# face_detection_results_rank_0_0.parquet +# face_detection_results_rank_0_1.parquet +# ... +# ============================================================================= + +# ============================================================================= +# PERFORMANCE TUNING GUIDELINES: +# ============================================================================= +# +# GPU Memory Optimization: +# - Reduce batch_size if running out of GPU memory +# - Face detection is typically less memory-intensive than embedding +# +# CPU/I-O Optimization: +# - Increase num_workers for faster data loading (but watch CPU usage) +# - Increase prefetch_factor for better pipeline utilization +# - Increase read_batch_size for faster Parquet I/O +# +# Distributed Processing: +# - Use evenly_distribute=true for better load balancing +# - Set stagger=true if experiencing I/O bottlenecks +# +# Detection Quality vs Speed: +# - Lower confidence_threshold = more detections but more false positives +# - Higher confidence_threshold = fewer but more reliable detections +# - Choose appropriate YOLO model size based on accuracy vs speed needs +# ============================================================================= diff --git a/docs-requirements.txt b/docs-requirements.txt index 797f127..a6bd7ef 100644 --- a/docs-requirements.txt +++ b/docs-requirements.txt @@ -2,3 +2,4 @@ mkdocs>=1.5.0 mkdocs-material>=9.0.0 mkdocstrings[python]>=0.20.0 pymdown-extensions>=10.0.0 +mkdocs-macros-plugin \ No newline at end of file diff --git a/docs/animal-detection-guide.md b/docs/animal-detection-guide.md new file mode 100644 index 0000000..24f0472 --- /dev/null +++ b/docs/animal-detection-guide.md @@ -0,0 +1,645 @@ +# Animal Detection Guide + +This guide covers how to perform animal detection on large image datasets using MegaDetector[^1] [^2]. MegaDetector is specifically trained for wildlife camera trap images. + +[^1]: Beery, S., Morris, D., & Yang, S. MegaDetector. GitHub repository: https://github.com/agentmorris/MegaDetector + +[^2]: Microsoft AI for Earth. CameraTraps: Tools for training and running detectors and classifiers for wildlife camera trap images. GitHub repository: https://github.com/microsoft/CameraTraps + +## Overview + +The animal detection module provides high-performance batch processing of images for wildlife detection using MegaDetector. 
It's designed for HPC environments with GPU acceleration and distributed processing capabilities. + +### Key Features + +- **MegaDetector Support**: Optimized for Microsoft's MegaDetector models (specifically designed for wildlife camera trap images) +- **Multiple Input Types**: Process image directories or Parquet datasets +- **Distributed Processing**: Multi-GPU and multi-node support via SLURM +- **Efficient Data Pipeline**: Optimized data loading with prefetching and parallel processing +- **Comprehensive Output**: Detailed detection results with confidence scores and bounding boxes +- **Performance Monitoring**: Built-in profiling and resource usage tracking + +## Quick Start + +Create a working directory at home directory and change current directory. + +```bash +mkdir ~/animal_detection +cd ~/animal_detection +``` + +### Install Dependency + +First, follow the [Getting Started guide](getting-started.md) to set up your environment and install the base `hpc-inference` package. + +Once you have the base installation, install the detection-specific dependencies: + +```bash +# Activate your virtual environment +source ~/venvs/hpc_inference/bin/activate + +# Install detection dependencies +uv pip install "hpc-inference[detection]" +``` +This will install additional packages required for animal detection. + +### Download Example Data + +The **ENA24 Detection Dataset**[^3] is a wildlife camera trap dataset collected from Estonia across multiple sites. The dataset was created as part of the Efficient Neural Architecture Design (ENA24) detection challenge and features diverse wildlife species in natural forest environments. For this guide, we use the **balanced subset** available on [Hugging Face](https://huggingface.co/datasets/imageomics/IDLE-OO-Camera-Traps), which provides a curated selection of 1,120 high-quality images with improved class distribution. + +[^3]: Yousif H, Kays R, Zhihai H. Dynamic Programming Selection of Object Proposals for Sequence-Level Animal Species Classification in the Wild. IEEE Transactions on Circuits and Systems for Video Technology, 2019. + +=== "Python" + + ```python + from pathlib import Path + from huggingface_hub import snapshot_download + + # Download the ENA24 subset (~1GB) + data_dir = "~/animal_detection/camera_trap_data" + + snapshot_download( + repo_id="imageomics/IDLE-OO-Camera-Traps", + repo_type="dataset", + local_dir=data_dir, + allow_patterns=["data/test/ENA24/*"] + ) + + # Set up paths + image_dir = Path(data_dir) / "data/test/ENA24" + print(f"Dataset downloaded to: {image_dir}") + print(f"Number of images: {len(list(image_dir.glob('*.png')))}") + ``` + +=== "Bash" + + ```bash + # Install huggingface_hub if not already installed + # pip install huggingface_hub + + # Download only the ENA24 subset + huggingface-cli download imageomics/IDLE-OO-Camera-Traps \ + --repo-type dataset \ + --local-dir ~/animal_detection/camera_trap_data \ + --include "data/test/ENA24/*" + + # Verify download + echo "Dataset downloaded to: camera_trap_data/data/test/ENA24" + echo "Number of images: $(find ~/animal_detection/camera_trap_data -name '*.png' -o -name '*.PNG' | wc -l)" + ``` + +### Download Model Weights + +Check out the [Model Zoo](https://microsoft.github.io/CameraTraps/model_zoo/megadetector/) to compare different Megadetector model releases. In this post, we'll use the most capable model `MDV6-yolov10-e`. 
+ +```bash +cd ~/animal_detection +mkdir -p model/megadetector +wget -O model/megadetector/MDV6-yolov10-e-1280.pt "https://zenodo.org/records/15398270/files/MDV6-yolov10-e-1280.pt?download=1" +``` + +**MegaDetector Class Categories:** + +- `0`: Animal +- `1`: Person +- `2`: Vehicle + + +### Basic Usage + +Let's perform face detection inference using `MDV6-yolov10-e`. We'll specify the target & output directory, input types & model weights location. Please make sure your machine has GPU available when you run this example. + +```bash +# Using command line arguments +python -m hpc_inference.inference.detection.animal_detect \ + $HOME/animal_detection/camera_trap_data/data/test/ENA24 \ + $HOME/animal_detection/detection \ + --input_type images \ + --model_weights "$HOME/animal_detection/model/megadetector/MDV6-yolov10-e-1280.pt" +``` + +Check detection output: +```bash +tree $HOME/animal_detection/detection +``` +``` +animal_detection/detection +├── detections +│ └── rank_0 +│ └── animal_detection_results_rank_0_0.parquet +└── profile_results + └── rank_0 + ├── computing_specs.json + ├── cpu_gpu_usage_plot.png + ├── gpu_mem_usage_plot.png + ├── profile_log.csv + └── usage_log.csv +``` + +We'll go through script arguments and detection output in details in the following section. + + +## Input Data Requirements + +### Image Directory Structure + +The script supports flexible directory structures: + +**Flat Structure:** +``` +/images/ +├── image001.jpg +├── image002.png +└── image003.jpeg +``` + +**Nested Structure:** +``` +/images/ +├── category1/ +│ ├── img1.jpg +│ └── img2.png +└── category2/ + ├── img3.jpg + └── img4.png +``` + +**Supported Image Formats:** + +- JPEG (.jpg, .jpeg) +- PNG (.png) +- BMP (.bmp) +- TIFF (.tif, .tiff) +- WebP (.webp) + +All images are automatically converted to RGB mode for processing. + +### Parquet Input + +The module can also process compressed parquet image dataset. + +```bash +python -m hpc_inference.inference.detection.animal_detect \ + /path/to/parquet/files \ + /path/to/output \ + --input_type parquet \ + --config your_config.yaml +``` +Learn more about the parquet file specific configuration in the config section. + +## Configuration + +### Using Configuration Files (Recommended) + +Configuration files provide better reproducibility and easier parameter management. + +You can find documented template for config specification in `configs/animal_detection` + +
+<details>
+<summary>Image Folder Dataset Config Template</summary>
+
+{{ include_file_as_code("configs/animal_detection/config_animal_detect_image_folder_template.yaml", "yaml") }}
+
+</details>
+
+<details>
+<summary>Parquet Image Dataset Config Template</summary>
+
+{{ include_file_as_code("configs/animal_detection/config_animal_detect_parquet_template.yaml", "yaml") }}
+
+</details>
+ + + +#### Model Configuration +```yaml +model: + weights: MDV6-yolov10-e-1280.pt # MegaDetector model weights location +``` + +#### Detection Parameters +```yaml +confidence_threshold: 0.2 # Minimum confidence for detections +image_size: 1280 # Input image size (square format) +``` + +#### Performance Settings +```yaml +batch_size: 16 # Images per batch (adjust for GPU memory) +num_workers: 20 # Data loading workers +prefetch_factor: 8 # Prefetch batches per worker +``` + +#### Image Processing +```yaml +validate_images: false # Validate images with PIL (slower but safer) +uuid_mode: filename # How to generate unique IDs +evenly_distribute: true # Balance load across workers +``` + +### UUID Generation Modes + +Control how unique identifiers are generated for images: + +- **`filename`**: `image001.jpg` - Good for flat directories +- **`relative`**: `subfolder/image001.jpg` - Good for nested directories +- **`fullpath`**: `/full/path/to/image001.jpg` - Full traceability +- **`hash`**: MD5 hash of path - Anonymous IDs + + +## HPC/SLURM Usage + +If you are working on HPC, you can use the SLURM templates in `scripts/animal_detection` to schedule batch detection jobs and potentially scale across multiple GPUs. + +
+<details>
+<summary>Image Folder Dataset SLURM Template</summary>
+
+{{ include_file_as_code("scripts/animal_detection/animal_detect_image_folder_template.slurm", "slurm") }}
+
+</details>
+
+<details>
+<summary>Parquet Image Dataset SLURM Template</summary>
+
+{{ include_file_as_code("scripts/animal_detection/animal_detect_parquet_template.slurm", "slurm") }}
+
+</details>
+ +### Single Process Job + +Let's create a config file and submit the detection task as a SLURM job using one GPU. + +Create a `config` folder and an empty config file: +```bash +cd ~/animal_detection +mkdir configs +nano configs/animal_detect_config.yaml +``` +Paste the content below to `configs/animal_detect_config.yaml` +```yaml +model: + weights: model/megadetector/MDV6-yolov10-e-1280.pt # Use absolute path if possible + +confidence_threshold: 0.2 +image_size: 1280 + +batch_size: 16 +num_workers: 20 +prefetch_factor: 8 + +validate_images: true +uuid_mode: filename +evenly_distribute: true +``` + +Now create a `log` directory to store job logs +```bash +mkdir -p ~/animal_detection/logs +``` + +Save and exit (in nano: `Ctrl+X`, then `Y`, then `Enter`). Check the file was created correctly +```bash +cat ~/animal_detection/configs/animal_detect_config.yaml +``` +Now create a `scripts` folder and an empty SLURM script +```bash +cd ~/animal_detection +mkdir scripts +nano scripts/animal_detect_single_process.slurm +``` +Paste the content below to `configs/animal_detect_config.yaml`. Make sure to replace the `partition` & `account` based on your HPC credentials. Also replace the virtual environment placeholder with the absolute path to your virtual environment. +```bash +#!/bin/bash +#SBATCH --job-name=animal_detect_single_process +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-task=1 +#SBATCH --cpus-per-task=48 +#SBATCH --partition=YOUR_CLUSTER_PARTITION +#SBATCH --time=00:10:00 +#SBATCH --output=logs/animal_detect_single_process_%j.out +#SBATCH --error=logs/animal_detect_single_process_%j.err +#SBATCH --account=YOUR_ACCOUNT_NUMBER + +module load cuda/12.4.1 +source PATH/TO/YOUR/VENV/bin/activate + +TARGET_DIR="$HOME/animal_detection/camera_trap_data/data/test/ENA24" +OUTPUT_DIR="$HOME/animal_detection/detection_single_process" +CONFIG_FILE="$HOME/animal_detection/configs/animal_detect_config.yaml" + + +srun python -m hpc_inference.inference.detection.animal_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type images \ + --config "${CONFIG_FILE}" +``` +Save and exit (in nano: `Ctrl+X`, then `Y`, then `Enter`). Check the file was created correctly +```bash +cat ~/animal_detection/scripts/animal_detect_single_process.slurm +``` +Using this SLURM script, we'll perform the inference pipeline on **one** compute node allocated with 1 GPU & 48 CPUs. Now let's submit the SLURM job + +```bash +cd ~/animal_detetction +sbatch scripts/animal_detect_single_process.slurm +``` + +### Multi-processes Distributed Processing + +For large-scale datasets containing millions of images, you should scale by increasing the number of processes. 
Depending on your computing resource, this could be done in several ways: + +**Multiple GPUs per Node** + +If your nodes have multiple GPUs and sufficient CPU cores: + +```bash +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 # 2 processes per node +#SBATCH --gpus-per-task=1 # 1 GPU per process +#SBATCH --cpus-per-task=48 # 48 CPU per process +``` + +**Multiple Nodes** + +If you only have one GPU per node or don't have enough CPUs for each process, you can simply increase the number of nodes + +```bash +#SBATCH --nodes=4 # 4 nodes total +#SBATCH --ntasks-per-node=1 # 1 process per node +#SBATCH --gpus-per-task=1 # 1 GPU per process +#SBATCH --cpus-per-task=48 # Full node resources +``` + +Let's create a multi-processes SLURM job script by increasing the number nodes: +```bash +nano scripts/animal_detect_multi_processes.slurm +``` +```bash +#!/bin/bash +#SBATCH --job-name=animal_detect_multi_processes +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-task=1 +#SBATCH --cpus-per-task=48 +#SBATCH --partition=YOUR_CLUSTER_PARTITION +#SBATCH --time=00:10:00 +#SBATCH --output=logs/animal_detect_multi_processes%j.out +#SBATCH --error=logs/animal_detect_multi_processes%j.err +#SBATCH --account=YOUR_ACCOUNT_NUMBER + +module load cuda/12.4.1 +source PATH/TO/YOUR/VENV/bin/activate + +TARGET_DIR="$HOME/animal_detection/camera_trap_data/data/test/ENA24" +OUTPUT_DIR="$HOME/animal_detection/detection_multi_processes" +CONFIG_FILE="$HOME/animal_detection/configs/animal_detect_config.yaml" + + +srun python -m hpc_inference.inference.detection.animal_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type images \ + --config "${CONFIG_FILE}" +``` +Using this SLURM script, we'll perform the inference pipeline on **4** compute nodes allocated with 1 GPU & 48 CPUs per node. Now let's submit the SLURM job + +```bash +cd ~/animal_detetction +sbatch scripts/animal_detect_multi_processes.slurm +``` + + +## Output Format + +### Directory Structure + +Here's the example output directory structure for single-process detection. +``` +output_dir/ +├── detections/ +│ └── rank_0/ +│ ├── animal_detection_results_rank_0_0.parquet +│ ├── animal_detection_results_rank_0_1.parquet +│ └── ... +│ └── animal_detection_results_rank_0_n.parquet +└── profile_results/ + └── rank_0/ + ├── computing_specs.json + ├── cpu_gpu_usage_plot.png + ├── gpu_mem_usage_plot.png + ├── profile_log.csv + └── usage_log.csv +``` +For multiple-processes jobs, we'll simply see more ranks, where the files within stays consistent: +``` +output_dir/ +├── detections/ +│ └── rank_0/ +│ └── ... +│ └── rank_n/ +└── profile_results/ +│ └── rank_0/ +│ └── ... 
+│ └── rank_n/ +``` + +### Detection Results Schema + +Each Parquet file contains detection results with these columns: + +| Column | Type | Description | +|--------|------|-------------| +| `uuid` | String | Unique identifier for each image | +| `max_detection_score` | Float | Maximum confidence score across all detections (0.0 if no animals detected) | +| `num_detections` | Integer | Total number of detections above threshold | +| `detections` | JSON String | Detailed detection information for each detected object | + +Each detection object in the JSON includes: +```json +{ + "bbox": [x1, y1, x2, y2], // Absolute pixel coordinates + "bbox_normalized": [x1, y1, x2, y2], // Normalized coordinates (0-1) + "confidence": 0.85, // Detection confidence score + "class_id": 0, // Numeric class ID + "class_name": "animal" // Human-readable class name +} +``` + +### Example Output + +```python +import pandas as pd +import json + +# Load results +df = pd.read_parquet("~/animal_detection/detection/detections/rank_0/animal_detection_results_rank_0_0.parquet") + +# View summary +print(f"Processed {len(df)} images") +print(f"Found animals in {sum(df['num_detections'] > 0)} images") +print(f"Average detections per image: {df['num_detections'].mean():.2f}") +``` +``` +Processed 1120 images +Found animals in 1116 images +Average detections per image: 1.10 +``` +```python +# Parse detections for first image with animals +first_detection = df[df['num_detections'] > 0].iloc[0] +detections = json.loads(first_detection['detections']) +print(f"Image {first_detection['uuid']} has {len(detections)} detections:") +for detection in detections: + print(f" {detection['class_name']}: {detection['confidence']:.3f}") +``` +``` +[{'bbox': [188.17922973632812, + 656.734619140625, + 443.3848876953125, + 794.3240966796875], + 'bbox_normalized': [0.14701502323150634, + 0.5130739212036133, + 0.3463944435119629, + 0.6205657005310059], + 'confidence': 0.8981033563613892, + 'class_id': 0, + 'class_name': 'animal'}] +``` + +Megadetector detected one animal with high confident 0.8981. Let's use the utility function to plot the detection box on top of the original image to validate the detection result: + +```python +from hpc_inference.utils.visualization import plot_detections_matplotlib + +image_uuid = first_detection['uuid'] +image_path = f"camera_trap_data/data/test/ENA24/{image_uuid}" + +image = Image.open(image_path) +detections = json.loads(first_detection['detections']) + +plot_detections_matplotlib( + image, detections, + letterbox_shape=(1280, 1280), + title=f"Detections for {image_uuid}" +) +``` +![Original vs Processed Beetle Images](imgs/detection_box.png) + +!!!note + Notice we specified `letterbox_shape=(1280, 1280)` for the `letterbox_shape` argument. This matches the default `image_size: 1280` used during preprocessing for MegaDetector in our script. + + Always keep track of the preprocessing image size and specify the same dimensions when plotting detection boxes on the original image to ensure accurate bounding box positioning. 
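+
+When you scale to multiple processes, the detection results are split across the `rank_*` directories and multiple Parquet files shown above. Below is a minimal sketch (assuming the output layout from this guide and a standard pandas installation; adjust the output path to your own run) for combining every per-rank result file into a single DataFrame for downstream analysis:
+
+```python
+import pandas as pd
+from pathlib import Path
+
+# Root output directory used in the multi-process example; adjust to your own run
+output_dir = Path("~/animal_detection/detection_multi_processes").expanduser()
+
+# Collect every per-rank Parquet file under detections/rank_*/
+result_files = sorted((output_dir / "detections").glob("rank_*/*.parquet"))
+
+# Concatenate all ranks into one DataFrame
+df = pd.concat((pd.read_parquet(f) for f in result_files), ignore_index=True)
+
+print(f"Loaded {len(df)} detection records from {len(result_files)} files")
+print(f"Images with at least one detection: {(df['num_detections'] > 0).sum()}")
+```
+
+From here you can filter on `max_detection_score`, parse the `detections` JSON column, or join the combined results back to your image metadata.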
+ + +## Performance Optimization & Troubleshooting + +### Common Issues & Solutions + +#### **GPU Memory Issues** + +**Symptoms:** + +- CUDA out of memory errors +- Sudden job termination +- GPU memory warnings + +**Solutions:** +```yaml +# Reduce GPU memory usage +batch_size: 8 # Start small and increase gradually +image_size: 1024 # Use smaller input size +``` + +- Monitor GPU usage: `nvidia-smi` during processing +- Check profile results after job completion for memory patterns + +#### **CPU/DataLoader OOM Crashes** + +**Symptoms:** + +- Workers being killed during data loading +- "DataLoader worker (pid XXXX) is killed by signal" errors +- Job hangs during data preprocessing + +**Solutions:** +```yaml +# Reduce memory pressure in data loading +num_workers: 8 # Fewer parallel workers +prefetch_factor: 2 # Less prefetching +evenly_distribute: true # Better load balancing +``` + +If possible, use more processors to reduce the risk of OOM crashing. +```bash +#SBATCH --ntasks-per-node=64 # Use 64 cores per task, increase RAM +``` + + +#### **Model Loading Errors** + +**Symptoms:** + +- "Model file not found" errors +- CUDA compatibility issues +- Ultralytics import errors + +**Solutions:** + +- Verify model weights file exists and is accessible +- Check CUDA version compatibility +- Ensure dependencies +- Pre-download models to avoid network issues during job execution + +#### **Detection Quality Issues** + +**Symptoms:** + +- Too many false positives +- Missing obvious animals +- Poor detection accuracy + +**Tuning Parameters:** +```yaml +# Balance quality vs speed +confidence_threshold: 0.6 # Increase the threshold for positive detection -> increase precision +image_size: 1280 # Higher resolution for better accuracy +model: + weights: MDV6-yolov10-e-1280.pt # Better model +``` + +**Quality vs Speed Trade-offs:** + +- **Higher confidence_threshold**: Fewer false positives, may miss some animals +- **Lower confidence_threshold**: More detections, high recall but more false positives +- **Larger image_size**: Better accuracy, slower processing +- **Smaller image_size**: Faster processing, may miss small animals + +### Performance Monitoring & Best Practices + +**Built-in Monitoring:** + +- **Real-time logging**: Progress updates and throughput metrics +- **Performance profiles**: Detailed timing analysis in `profile_results/` +- **Resource tracking**: CPU, GPU, and memory utilization plots +- **Batch statistics**: Per-batch timing and detection counts + +**Optimization Workflow:** + +1. **Start small**: Test with 100-1000 images first +2. **Monitor resources**: Watch GPU/CPU utilization during test run +3. **Analyze profiles**: Check `profile_results/` for bottlenecks +4. **Tune parameters**: Adjust based on resource utilization +5. 
**Scale up**: Apply successful settings to full dataset + +**Configuration Management:** + +- **Save working configs**: Keep track of successful parameter combinations +- **Version control**: Use git to track configuration changes +- **Document settings**: Note dataset-specific optimizations + diff --git a/docs/imgs/detection_box.png b/docs/imgs/detection_box.png new file mode 100644 index 0000000..b1d5af5 Binary files /dev/null and b/docs/imgs/detection_box.png differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..53dcb93 --- /dev/null +++ b/main.py @@ -0,0 +1,30 @@ +import os + +def define_env(env): + """Define custom macros for MkDocs.""" + + @env.macro + def include_file_as_code(file_path, language="markdown"): + """ + Include the content of a file within a code block. + + Args: + file_path (str): The path to the file to include, relative to the project root. + language (str): The language identifier for syntax highlighting. + + Returns: + str: A Markdown-formatted code block containing the file's content. + """ + full_path = os.path.join(env.project_dir, file_path) + try: + with open(full_path, 'r', encoding='utf-8') as f: + content = f.read() + except FileNotFoundError: + content = f"**Error:** The file `{file_path}` was not found." + + # Escape triple backticks in content to prevent breaking the code block + content = content.replace("```", "```\u200b") + + line_nums_string = "{ py linenums='1' }" + + return f"```{language} {line_nums_string}\n{content}\n```" \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 46a3969..eaebce9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,7 @@ nav: - Getting Started: getting-started.md - User Guide: - ImageFolder Dataset: imagefolder-guide.md + - Animal Detction: animal-detection-guide.md - GPU Utilization Metrics: utilization-metrics.md - API Reference: api-reference.md @@ -44,19 +45,34 @@ theme: - navigation.expand - navigation.top - search.highlight + - content.tabs.link markdown_extensions: - admonition + - attr_list + - footnotes + - md_in_html + - pymdownx.tabbed: + alternate_style: true + - pymdownx.betterem + - pymdownx.blocks.caption - pymdownx.details + - pymdownx.inlinehilite + - pymdownx.snippets - pymdownx.superfences + - pymdownx.tasklist + - pymdownx.tilde + - pymdownx.keys - pymdownx.highlight: anchor_linenums: true - - pymdownx.inlinehilite - - pymdownx.snippets + line_spans: __span + pygments_lang_class: true - pymdownx.arithmatex: generic: true - toc: permalink: true + title: 📖 On This Page + extra_javascript: - https://polyfill.io/v3/polyfill.min.js?features=es6 @@ -64,6 +80,7 @@ extra_javascript: plugins: - search + - macros - mkdocstrings: handlers: python: diff --git a/pyproject.toml b/pyproject.toml index 5619ca2..6aab7ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,9 +8,11 @@ dynamic = ["version"] description = "High-performance computing solution for efficient batch inference on large-scale image datasets" readme = "README.md" license = "MIT" -requires-python = ">=3.8" +requires-python = ">=3.10" authors = [ { name = "Net Zhang", email = "zhang.11091@osu.edu" }, + { name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" }, + { name = "Matthew J. 
Thompson", email = "thompson.m.j@outlook.com" } ] keywords = [ "machine-learning", @@ -25,8 +27,6 @@ classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -45,6 +45,7 @@ dependencies = [ "numpy>=1.21.0", # Basic numpy operations "matplotlib>=3.5.0", # Profiling plots and visualizations "psutil>=5.9.0", # System resource monitoring + "pynvml>=11.5.0", # GPU monitoring and profiling "tqdm>=4.64.0", # Progress bars for profiling ] @@ -56,10 +57,15 @@ openclip = [ # Future model dependencies +detection = [ + "ultralytics>=8.0.0" +] + # Combined inference extras inference = [ "hpc-inference[openclip]", # Add other models as you create them + "hpc-inference[detection]", ] # All features @@ -84,11 +90,11 @@ docs = [ ] [project.urls] -Homepage = "https://github.com/yourusername/hpc-inference" -Documentation = "https://hpc-inference.readthedocs.io" -Repository = "https://github.com/yourusername/hpc-inference" -Changelog = "https://github.com/yourusername/hpc-inference/blob/main/CHANGELOG.md" -"Bug Tracker" = "https://github.com/yourusername/hpc-inference/issues" +Homepage = "https://github.com/Imageomics/hpc-inference" +Documentation = "https://imageomics.github.io/hpc-inference/" +Repository = "https://github.com/Imageomics/hpc-inference" +Changelog = "https://github.com/Imageomics/hpc-inference/blob/main/CHANGELOG.md" +"Bug Tracker" = "https://github.com/Imageomics/hpc-inference/issues" [tool.hatch.version] path = "src/hpc_inference/__init__.py" @@ -109,7 +115,7 @@ include = [ # Development tools configuration [tool.ruff] -target-version = "py38" +target-version = "py310" line-length = 88 select = ["E", "F", "I", "N", "W", "UP"] ignore = ["E501", "F401"] @@ -120,10 +126,10 @@ ignore = ["E501", "F401"] [tool.black] line-length = 88 -target-version = ["py38"] +target-version = ["py310"] [tool.mypy] -python_version = "3.8" +python_version = "3.10" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = true diff --git a/scripts/animal_detection/animal_detect_image_folder_template.slurm b/scripts/animal_detection/animal_detect_image_folder_template.slurm new file mode 100644 index 0000000..0b22800 --- /dev/null +++ b/scripts/animal_detection/animal_detect_image_folder_template.slurm @@ -0,0 +1,146 @@ +#!/bin/bash +#SBATCH --job-name=animal_detect_images_job # [REQUIRED] Set a descriptive job name +#SBATCH --nodes=NUM_NODES # [REQUIRED] Number of nodes to use +#SBATCH --ntasks-per-node=TASKS_PER_NODE # [RECOMMENDED] Number of tasks per node +#SBATCH --gpus-per-task=1 # [REQUIRED] Number of GPUs per task (set to 1) +#SBATCH --cpus-per-task=CPUS_PER_TASK # [RECOMMENDED] Number of CPU cores per task (e.g., 48) +#SBATCH --partition=PARTITION_NAME # [REQUIRED] Partition/queue name (e.g., gpu, gpu-exp) +#SBATCH --time=HH:MM:SS # [REQUIRED] Walltime limit (e.g., 8:00:00) +#SBATCH --output=logs/animal_detect_images_%j.out # [OPTIONAL] Stdout log file (%j = job ID) +#SBATCH --error=logs/animal_detect_images_%j.err # [OPTIONAL] Stderr log file +#SBATCH --account=ACCOUNT_NAME # [REQUIRED] Project account for allocation +#SBATCH --mail-type=ALL # [OPTIONAL] Email notifications (BEGIN, END, FAIL, ALL) +#SBATCH --mail-user=YOUR_EMAIL@domain.edu # [OPTIONAL] Email 
address for notifications + +# === Load modules and activate environment === +module load cuda/VERSION # [REQUIRED] Load CUDA module (e.g., cuda/12.4.1) +source /path/to/your/venv/bin/activate # [REQUIRED] Activate your Python virtual environment + +# === Ensure package is installed === +# Make sure hpc-inference package with YOLO dependencies is installed +# pip install 'hpc-inference[yolo]' or pip install ultralytics +which python # Print Python path for debugging + +# === Set data paths === +TARGET_DIR="/path/to/your/image_directory" # [REQUIRED] Directory containing input images +OUTPUT_DIR="/path/to/your/output_dir" # [REQUIRED] Directory to save animal detection results + +# === Choose your configuration method === +# Option 1: Use config file (RECOMMENDED for production) +CONFIG_FILE="/path/to/your/animal_detect_config.yaml" # Path to YAML config file + +srun python -m hpc_inference.inference.detection.animal_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type images \ + --config "${CONFIG_FILE}" + +# Option 2: Use command line arguments (for quick testing) +# Uncomment and modify the lines below, comment out the config version above +# +# srun python -m hpc_inference.inference.detection.animal_detect \ +# "${TARGET_DIR}" \ +# "${OUTPUT_DIR}" \ +# --input_type images \ +# --model_weights "md_v5a.0.0.pt" \ +# --confidence_threshold 0.2 \ +# --image_size 1280 \ +# --batch_size 8 \ +# --num_workers 24 \ +# --prefetch_factor 8 \ +# --max_rows_per_file 5000 \ +# --out_prefix "animal_detection_results" \ +# --uuid_mode filename \ +# --evenly_distribute \ +# --validate_images + + +# ------------------------------- +# ANIMAL DETECTION IMAGE FOLDER-SPECIFIC PARAMETERS +# ------------------------------- +# --input_type images: Tells the script to process image files from a directory +# --model_weights: Animal detection model file: +# Download available models from: +# https://microsoft.github.io/CameraTraps/model_zoo/megadetector +# MegaDetectorV6-Ultralytics-YoloV10-Extra is recommended for best performance +# +# --confidence_threshold: Minimum confidence score for animal detections (0.0-1.0) +# MegaDetector typically uses 0.2 as optimal threshold +# Lower = more detections (including false positives) +# Higher = fewer, more confident detections +# --image_size: Input image size for the model (square format) +# MegaDetector v5 typically uses 640 or 1280 +# Larger sizes improve accuracy but slow processing +# --uuid_mode: How to generate unique IDs from image paths: +# - "filename": Use just the filename (image001.jpg) +# - "relative": Use relative path from TARGET_DIR (subfolder/image001.jpg) +# - "fullpath": Use full absolute path (/full/path/to/image001.jpg) +# - "hash": Use MD5 hash of the full path (a1b2c3d4e5f6g7h8) +# --validate_images: [OPTIONAL] Validate that all images can be opened with PIL +# Slower but safer - catches corrupted files before processing +# --file_list: NOT applicable for image folders (will cause error) + +# SUPPORTED IMAGE FORMATS: +# .jpg, .jpeg, .png, .bmp, .tif, .tiff, .webp +# Images are automatically converted to RGB mode for animal detection + +# DIRECTORY STRUCTURE: +# TARGET_DIR can contain: +# - Flat structure: /images/img1.jpg, /images/img2.jpg, ... +# - Nested structure: /images/site1/img1.jpg, /images/site2/img2.jpg, ... +# All .jpg, .jpeg, .png, etc. 
files will be found recursively + +# OUTPUT FORMAT: +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (based on uuid_mode) +# - max_detection_score: Maximum confidence score across all detections (0.0 if no animals detected) +# - num_detections: Total number of detections above threshold +# - detections: JSON string with detailed detection information including: +# * bbox: Absolute pixel coordinates [x1, y1, x2, y2] +# * bbox_normalized: Normalized coordinates [0-1] +# * confidence: Detection confidence score +# * class_id: Numeric class ID (0=animal, 1=person, 2=vehicle for MegaDetector) +# * class_name: Human-readable class name +# Files are saved in: {OUTPUT_DIR}/detections/rank_{rank}/ + +# ------------------------------- +# SLURM Template Field Explanations +# ------------------------------- +# --job-name: Name for your job in the queue/monitoring system. +# --nodes: Number of nodes to allocate for the job. +# --gpus-per-task: Number of GPUs per task (set to 1 unless using model parallelism). +# --cpus-per-task: Number of CPU cores per task (should match or exceed your data loader workers). +# --ntasks-per-node: Number of parallel tasks per node. +# For animal detection, balance between available GPUs and I/O capacity. +# --partition: Cluster partition/queue to submit to (e.g., gpu, gpu-exp). +# --time: Maximum walltime for the job (format: HH:MM:SS). +# Animal detection can be slower than face detection, allow more time. +# --output: Path for standard output log file (use %j for job ID). +# --error: Path for standard error log file. +# --account: Your allocation/project account for resource usage. + +# PERFORMANCE TIPS FOR ANIMAL DETECTION ON IMAGE FOLDERS: +# - Animal detection (especially MegaDetector) is more memory-intensive than face detection +# - Start with smaller batch sizes and increase if memory allows +# - if encountering OOM errors in data loaders: +# - Reduce number of workers (--num_workers) +# - Reduce prefetch factor (--prefetch_factor) +# - Increase CPUs per task (--cpus-per-task) +# - Use --evenly_distribute for better load balancing when file sizes vary +# - Use --validate_images if you suspect corrupted files (adds startup time) +# - Consider --uuid_mode based on your downstream analysis needs + +# MEGADETECTOR MODEL NOTES: +# - Pytorch Wildlife model zoos: https://microsoft.github.io/CameraTraps/model_zoo/megadetector/ +# - MegaDetector is specifically designed for wildlife camera trap images +# - Trained on millions of camera trap images from around the world +# - Detects animals and people with high accuracy +# - Works best on outdoor/natural settings +# - May not perform as well on indoor pets or zoo animals +# - Models will be automatically downloaded on first use +# - Ensure internet connectivity or pre-download models to avoid delays + +# MODEL DOWNLOAD AND CACHING: +# - Models will be automatically downloaded to ~/.cache/ultralytics/ on first use +# - Consider pre-downloading models before job execution: +# wget https://zenodo.org/records/15398270/files/MDV6-yolov10-e-1280.pt?download=1 diff --git a/scripts/animal_detection/animal_detect_parquet_template.slurm b/scripts/animal_detection/animal_detect_parquet_template.slurm new file mode 100644 index 0000000..2a04dd1 --- /dev/null +++ b/scripts/animal_detection/animal_detect_parquet_template.slurm @@ -0,0 +1,181 @@ +#!/bin/bash +#SBATCH --job-name=animal_detect_parquet_job # [REQUIRED] Set a descriptive job name +#SBATCH --nodes=NUM_NODES # [REQUIRED] Number of 
nodes to use +#SBATCH --ntasks-per-node=TASKS_PER_NODE # [RECOMMENDED] Number of tasks per node +#SBATCH --gpus-per-task=1 # [REQUIRED] Number of GPUs per task (set to 1) +#SBATCH --cpus-per-task=CPUS_PER_TASK # [RECOMMENDED] Number of CPU cores per task (e.g., 48) +#SBATCH --partition=PARTITION_NAME # [REQUIRED] Partition/queue name (e.g., gpu, gpu-exp) +#SBATCH --time=HH:MM:SS # [REQUIRED] Walltime limit (e.g., 8:00:00) +#SBATCH --output=logs/animal_detect_parquet_%j.out # [OPTIONAL] Stdout log file (%j = job ID) +#SBATCH --error=logs/animal_detect_parquet_%j.err # [OPTIONAL] Stderr log file +#SBATCH --account=ACCOUNT_NAME # [REQUIRED] Project account for allocation +#SBATCH --mail-type=ALL # [OPTIONAL] Email notifications (BEGIN, END, FAIL, ALL) +#SBATCH --mail-user=YOUR_EMAIL@domain.edu # [OPTIONAL] Email address for notifications + +# === Load modules and activate environment === +module load cuda/VERSION # [REQUIRED] Load CUDA module (e.g., cuda/12.4.1) +source /path/to/your/venv/bin/activate # [REQUIRED] Activate your Python virtual environment + +# === Ensure package is installed === +# Make sure hpc-inference package with YOLO dependencies is installed +# pip install 'hpc-inference[yolo]' or pip install ultralytics +which python # Print Python path for debugging + +# === Set data paths === +TARGET_DIR="/path/to/your/parquet_files" # [REQUIRED] Directory containing input Parquet files +OUTPUT_DIR="/path/to/your/output_dir" # [REQUIRED] Directory to save animal detection results + +# === Choose your configuration method === +# Option 1: Use config file (RECOMMENDED for production) +CONFIG_FILE="/path/to/your/animal_detect_parquet_config.yaml" # Path to YAML config file +FILE_LIST="/path/to/your/file_list.txt" # [OPTIONAL] File with list of Parquet files + +srun python -m hpc_inference.inference.detection.animal_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type parquet \ + --config "${CONFIG_FILE}" \ + --file_list "${FILE_LIST}" # [OPTIONAL] Use this if you have a specific list of files to process + +# Option 2: Use command line arguments (for quick testing) +# Uncomment and modify the lines below, comment out the config version above +# +# srun python -m hpc_inference.inference.detection.animal_detect \ +# "${TARGET_DIR}" \ +# "${OUTPUT_DIR}" \ +# --input_type parquet \ +# --model_weights "md_v5a.0.0.pt" \ +# --confidence_threshold 0.2 \ +# --image_size 1280 \ +# --batch_size 8 \ +# --num_workers 24 \ +# --prefetch_factor 8 \ +# --read_batch_size 64 \ +# --max_rows_per_file 5000 \ +# --out_prefix "animal_detection_results" \ +# --read_columns uuid image original_size resized_size \ +# --evenly_distribute + + +# ------------------------------- +# ANIMAL DETECTION PARQUET-SPECIFIC PARAMETERS +# ------------------------------- +# --input_type parquet: Tells the script to process Parquet files containing encoded images +# --model_weights: Animal detection model file: +# - "md_v5a.0.0.pt" (MegaDetector v5a, recommended for wildlife) +# - "md_v5b.0.0.pt" (MegaDetector v5b, alternative) +# - "yolov8n.pt" (YOLOv8 nano, fastest) +# - "yolov8s.pt" (YOLOv8 small, balanced) +# - "yolov8m.pt" (YOLOv8 medium, more accurate) +# - "yolov8l.pt" (YOLOv8 large, most accurate) +# --confidence_threshold: Minimum confidence score for animal detections (0.0-1.0) +# MegaDetector typically uses 0.2 as optimal threshold +# Lower = more detections (including false positives) +# Higher = fewer, more confident detections +# --image_size: Input image size for the model (square format) +# 
MegaDetector v5 typically uses 1280 +# Larger sizes improve accuracy but slow processing +# --read_batch_size: Number of rows to read from Parquet files at once +# Smaller than face detection due to larger output per image +# --read_columns: Columns to read from Parquet files (space-separated) +# REQUIRED: uuid, image +# OPTIONAL: original_size, resized_size, etc. +# --file_list: [OPTIONAL] Path to text file containing list of Parquet files to process +# Useful for processing specific subsets of data +# --evenly_distribute: Distribute files evenly based on size for better load balancing +# --stagger: [OPTIONAL] Stagger worker start times to reduce I/O congestion + +# PARQUET DATA REQUIREMENTS: +# Your Parquet files must contain: +# 1. 'uuid' column: Unique string identifier for each image +# 2. 'image' column: Image data encoded as bytes (from PIL Image.save() to BytesIO) +# 3. Optional metadata columns as specified in --read_columns + +# OUTPUT FORMAT: +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (from input Parquet) +# - max_detection_score: Maximum confidence score across all detections (0.0 if no animals detected) +# - num_detections: Total number of detections above threshold +# - detections: JSON string with detailed detection information including: +# * bbox: Absolute pixel coordinates [x1, y1, x2, y2] +# * bbox_normalized: Normalized coordinates [0-1] +# * confidence: Detection confidence score +# * class_id: Numeric class ID (0=animal, 1=person, 2=vehicle for MegaDetector) +# * class_name: Human-readable class name +# Files are saved in: {OUTPUT_DIR}/detections/rank_{rank}/ + +# ------------------------------- +# SLURM Template Field Explanations +# ------------------------------- +# --job-name: Name for your job in the queue/monitoring system. +# --nodes: Number of nodes to allocate for the job. +# --gpus-per-task: Number of GPUs per task (set to 1 unless using model parallelism). +# --cpus-per-task: Number of CPU cores per task (should match or exceed your data loader workers). +# --ntasks-per-node: Number of parallel tasks per node. +# For animal detection, balance between available GPUs and I/O capacity. +# --partition: Cluster partition/queue to submit to (e.g., gpu, gpu-exp). +# --time: Maximum walltime for the job (format: HH:MM:SS). +# Animal detection can be slower than face detection, allow more time. +# --output: Path for standard output log file (use %j for job ID). +# --error: Path for standard error log file. +# --account: Your allocation/project account for resource usage. 
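+
+# POST-PROCESSING SKETCH (OPTIONAL):
+# A minimal sketch for aggregating the output Parquet files after the job finishes,
+# assuming pandas is installed in your environment; the glob pattern and the
+# 0.5 animal threshold below are illustrative, not part of this package.
+#
+#   import glob, json
+#   import pandas as pd
+#
+#   files = glob.glob("OUTPUT_DIR/detections/rank_*/animal_detection_results_*.parquet")
+#   df = pd.concat(pd.read_parquet(f) for f in files)           # one row per image
+#   hits = df[df["num_detections"] > 0].copy()                  # keep images with detections
+#   hits["detections"] = hits["detections"].apply(json.loads)   # JSON string -> list of dicts
+#
+#   def has_animal(dets, thr=0.5):
+#       return any(d["class_name"] == "animal" and d["confidence"] >= thr for d in dets)
+#
+#   animal_uuids = hits[hits["detections"].apply(has_animal)]["uuid"].tolist()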
+ +# PERFORMANCE TIPS FOR ANIMAL DETECTION ON PARQUET: +# - Animal detection (especially MegaDetector) is more memory-intensive than face detection +# - Start with smaller batch sizes (8 instead of 16) and increase if memory allows +# - Use --evenly_distribute for better load balancing across workers +# - Adjust --read_batch_size for optimal I/O performance (64 is conservative) +# - Use --file_list to process specific subsets of your data efficiently +# - MegaDetector confidence_threshold of 0.2 is typically optimal for wildlife images +# - Choose model based on use case: +# * MegaDetector: Best for wildlife/camera trap images +# * YOLOv8: General purpose, good for various animal contexts +# - Consider --stagger if experiencing I/O bottlenecks during startup + +# MEGADETECTOR MODEL NOTES: +# - MegaDetector is specifically designed for wildlife camera trap images +# - Trained on millions of camera trap images from around the world +# - Detects animals, people, and vehicles with high accuracy +# - Works best on outdoor/natural settings +# - May not perform as well on indoor pets or zoo animals +# - Models will be automatically downloaded on first use +# - Ensure internet connectivity or pre-download models to avoid delays + +# MODEL DOWNLOAD AND CACHING: +# - Models will be automatically downloaded to ~/.cache/ultralytics/ on first use +# - MegaDetector models are larger than standard YOLO models (~400MB vs ~50MB) +# - Consider pre-downloading models before job execution: +# python -c "from ultralytics import YOLO; YOLO('md_v5a.0.0.pt')" + +# FILE_LIST FORMAT (if using --file_list): +# Create a text file with one Parquet file path per line: +# /path/to/data/file1.parquet +# /path/to/data/file2.parquet +# /path/to/data/file3.parquet + +# EXAMPLE DETECTION OUTPUT: +# For each image, the detections JSON will contain: +# [ +# { +# "bbox": [120.5, 80.2, 340.8, 290.1], +# "bbox_normalized": [0.118, 0.078, 0.333, 0.284], +# "confidence": 0.85, +# "class_id": 0, +# "class_name": "animal" +# }, +# { +# "bbox": [450.0, 200.0, 600.0, 400.0], +# "bbox_normalized": [0.440, 0.195, 0.586, 0.391], +# "confidence": 0.72, +# "class_id": 1, +# "class_name": "person" +# } +# ] + +# CAMERA TRAP WORKFLOW CONSIDERATIONS: +# - MegaDetector is optimized for camera trap scenarios +# - Handles various lighting conditions and weather +# - Good at distinguishing animals from vegetation movement +# - Reduces false positives from shadows, leaves, etc. 
+# - Provides both detection and rough classification +# - Ideal for wildlife monitoring and ecological research diff --git a/scripts/face_detection/face_detect_image_folder_template.slurm b/scripts/face_detection/face_detect_image_folder_template.slurm new file mode 100644 index 0000000..a44d48d --- /dev/null +++ b/scripts/face_detection/face_detect_image_folder_template.slurm @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH --job-name=face_detect_images_job # [REQUIRED] Set a descriptive job name +#SBATCH --nodes=NUM_NODES # [REQUIRED] Number of nodes to use +#SBATCH --ntasks-per-node=TASKS_PER_NODE # [RECOMMENDED] Number of tasks per node +#SBATCH --gpus-per-task=1 # [REQUIRED] Number of GPUs per task (set to 1) +#SBATCH --cpus-per-task=CPUS_PER_TASK # [RECOMMENDED] Number of CPU cores per task (e.g., 48) +#SBATCH --partition=PARTITION_NAME # [REQUIRED] Partition/queue name (e.g., gpu, gpu-exp) +#SBATCH --time=HH:MM:SS # [REQUIRED] Walltime limit (e.g., 6:00:00) +#SBATCH --output=logs/face_detect_images_%j.out # [OPTIONAL] Stdout log file (%j = job ID) +#SBATCH --error=logs/face_detect_images_%j.err # [OPTIONAL] Stderr log file +#SBATCH --account=ACCOUNT_NAME # [REQUIRED] Project account for allocation +#SBATCH --mail-type=ALL # [OPTIONAL] Email notifications (BEGIN, END, FAIL, ALL) +#SBATCH --mail-user=YOUR_EMAIL@domain.edu # [OPTIONAL] Email address for notifications + +# === Load modules and activate environment === +module load cuda/VERSION # [REQUIRED] Load CUDA module (e.g., cuda/12.4.1) +source /path/to/your/venv/bin/activate # [REQUIRED] Activate your Python virtual environment + +# === Ensure package is installed === +# Make sure hpc-inference package with YOLO dependencies is installed +# pip install 'hpc-inference[yolo]' or pip install ultralytics +which python # Print Python path for debugging + +# === Set data paths === +TARGET_DIR="/path/to/your/image_directory" # [REQUIRED] Directory containing input images +OUTPUT_DIR="/path/to/your/output_dir" # [REQUIRED] Directory to save face detection results + +# === Choose your configuration method === +# Option 1: Use config file (RECOMMENDED for production) +CONFIG_FILE="/path/to/your/face_detect_config.yaml" # Path to YAML config file + +srun python -m hpc_inference.inference.detection.face_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type images \ + --config "${CONFIG_FILE}" + +# Option 2: Use command line arguments (for quick testing) +# Uncomment and modify the lines below, comment out the config version above +# +# srun python -m hpc_inference.inference.detection.face_detect \ +# "${TARGET_DIR}" \ +# "${OUTPUT_DIR}" \ +# --input_type images \ +# --model_weights "yolov8n-face.pt" \ +# --confidence_threshold 0.5 \ +# --image_size 1024 \ +# --batch_size 16 \ +# --num_workers 28 \ +# --prefetch_factor 16 \ +# --max_rows_per_file 10000 \ +# --out_prefix "face_detection_results" \ +# --uuid_mode filename \ +# --evenly_distribute \ +# --validate_images + + +# ------------------------------- +# FACE DETECTION IMAGE FOLDER-SPECIFIC PARAMETERS +# ------------------------------- +# --input_type images: Tells the script to process image files from a directory +# --model_weights: YOLO face detection model file: +# - "yolov8n-face.pt" (fastest, least accurate) +# - "yolov8s-face.pt" (balanced) +# - "yolov8m-face.pt" (more accurate) +# - "yolov8l-face.pt" (most accurate, slowest) +# --confidence_threshold: Minimum confidence score for face detections (0.0-1.0) +# Lower = more detections (including false positives) +# Higher = fewer, 
more confident detections +# --uuid_mode: How to generate unique IDs from image paths: +# - "filename": Use just the filename (image001.jpg) +# - "relative": Use relative path from TARGET_DIR (subfolder/image001.jpg) +# - "fullpath": Use full absolute path (/full/path/to/image001.jpg) +# - "hash": Use MD5 hash of the full path (a1b2c3d4e5f6g7h8) +# --validate_images: [OPTIONAL] Validate that all images can be opened with PIL +# Slower but safer - catches corrupted files before processing +# --file_list: NOT applicable for image folders (will cause error) + +# SUPPORTED IMAGE FORMATS: +# .jpg, .jpeg, .png, .bmp, .tif, .tiff, .webp +# Images are automatically converted to RGB mode for face detection + +# DIRECTORY STRUCTURE: +# TARGET_DIR can contain: +# - Flat structure: /images/img1.jpg, /images/img2.jpg, ... +# - Nested structure: /images/class1/img1.jpg, /images/class2/img2.jpg, ... +# All .jpg, .jpeg, .png, etc. files will be found recursively + +# OUTPUT FORMAT: +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (based on uuid_mode) +# - detection_score: Maximum confidence score for face detection (0.0 if no faces detected) +# Files are saved in: {OUTPUT_DIR}/detections/rank_{rank}/ + +# ------------------------------- +# SLURM Template Field Explanations +# ------------------------------- +# --job-name: Name for your job in the queue/monitoring system. +# --nodes: Number of nodes to allocate for the job. +# --gpus-per-task: Number of GPUs per task (set to 1 unless using model parallelism). +# --cpus-per-task: Number of CPU cores per task (should match or exceed your data loader workers). +# --ntasks-per-node: Number of parallel tasks per node. +# For face detection, balance between available GPUs and I/O capacity. +# --partition: Cluster partition/queue to submit to (e.g., gpu, gpu-exp). +# --time: Maximum walltime for the job (format: HH:MM:SS). +# --output: Path for standard output log file (use %j for job ID). +# --error: Path for standard error log file. +# --account: Your allocation/project account for resource usage. 
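+
+# READING THE RESULTS (OPTIONAL):
+# A minimal sketch for loading the face detection scores after the job finishes,
+# assuming pandas is installed; the path pattern and the 0.5 cutoff are illustrative.
+#
+#   import glob
+#   import pandas as pd
+#
+#   files = glob.glob("OUTPUT_DIR/detections/rank_*/face_detection_results_*.parquet")
+#   scores = pd.concat(pd.read_parquet(f) for f in files)   # columns: uuid, detection_score
+#   faces = scores[scores["detection_score"] >= 0.5]        # images with a confident face
+#   faces.to_csv("images_with_faces.csv", index=False)
+#
+# With uuid_mode "filename" or "relative", the uuid values can be joined back to
+# files under TARGET_DIR to locate the original images.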
+ +# PERFORMANCE TIPS FOR FACE DETECTION ON IMAGE FOLDERS: +# - Face detection is typically faster than embedding, consider larger batch sizes +# - Use --evenly_distribute for better load balancing when file sizes vary +# - Use --validate_images if you suspect corrupted files (adds startup time) +# - Consider --uuid_mode based on your downstream analysis needs +# - Lower --confidence_threshold finds more faces but increases false positives +# - Choose appropriate YOLO model size based on accuracy vs speed requirements +# - For large datasets, consider converting to Parquet format first for better I/O performance + +# YOLO MODEL DOWNLOAD: +# YOLO models will be automatically downloaded on first use +# Ensure internet connectivity or pre-download models to avoid delays during job execution diff --git a/scripts/face_detection/face_detect_parquet_template.slurm b/scripts/face_detection/face_detect_parquet_template.slurm new file mode 100644 index 0000000..9704107 --- /dev/null +++ b/scripts/face_detection/face_detect_parquet_template.slurm @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=face_detect_parquet_job # [REQUIRED] Set a descriptive job name +#SBATCH --nodes=NUM_NODES # [REQUIRED] Number of nodes to use +#SBATCH --ntasks-per-node=TASKS_PER_NODE # [RECOMMENDED] Number of tasks per node +#SBATCH --gpus-per-task=1 # [REQUIRED] Number of GPUs per task (set to 1) +#SBATCH --cpus-per-task=CPUS_PER_TASK # [RECOMMENDED] Number of CPU cores per task (e.g., 48) +#SBATCH --partition=PARTITION_NAME # [REQUIRED] Partition/queue name (e.g., gpu, gpu-exp) +#SBATCH --time=HH:MM:SS # [REQUIRED] Walltime limit (e.g., 6:00:00) +#SBATCH --output=logs/face_detect_parquet_%j.out # [OPTIONAL] Stdout log file (%j = job ID) +#SBATCH --error=logs/face_detect_parquet_%j.err # [OPTIONAL] Stderr log file +#SBATCH --account=ACCOUNT_NAME # [REQUIRED] Project account for allocation +#SBATCH --mail-type=ALL # [OPTIONAL] Email notifications (BEGIN, END, FAIL, ALL) +#SBATCH --mail-user=YOUR_EMAIL@domain.edu # [OPTIONAL] Email address for notifications + +# === Load modules and activate environment === +module load cuda/VERSION # [REQUIRED] Load CUDA module (e.g., cuda/12.4.1) +source /path/to/your/venv/bin/activate # [REQUIRED] Activate your Python virtual environment + +# === Ensure package is installed === +# Make sure hpc-inference package with YOLO dependencies is installed +# pip install 'hpc-inference[yolo]' or pip install ultralytics +which python # Print Python path for debugging + +# === Set data paths === +TARGET_DIR="/path/to/your/parquet_files" # [REQUIRED] Directory containing input Parquet files +OUTPUT_DIR="/path/to/your/output_dir" # [REQUIRED] Directory to save face detection results + +# === Choose your configuration method === +# Option 1: Use config file (RECOMMENDED for production) +CONFIG_FILE="/path/to/your/face_detect_parquet_config.yaml" # Path to YAML config file +FILE_LIST="/path/to/your/file_list.txt" # [OPTIONAL] File with list of Parquet files + +srun python -m hpc_inference.inference.detection.face_detect \ + "${TARGET_DIR}" \ + "${OUTPUT_DIR}" \ + --input_type parquet \ + --config "${CONFIG_FILE}" \ + --file_list "${FILE_LIST}" # [OPTIONAL] Use this if you have a specific list of files to process + +# Option 2: Use command line arguments (for quick testing) +# Uncomment and modify the lines below, comment out the config version above +# +# srun python -m hpc_inference.inference.detection.face_detect \ +# "${TARGET_DIR}" \ +# "${OUTPUT_DIR}" \ +# --input_type parquet \ +# 
--model_weights "yolov8n-face.pt" \ +# --confidence_threshold 0.5 \ +# --image_size 1024 \ +# --batch_size 16 \ +# --num_workers 28 \ +# --prefetch_factor 16 \ +# --read_batch_size 128 \ +# --max_rows_per_file 10000 \ +# --out_prefix "face_detection_results" \ +# --read_columns uuid image original_size resized_size \ +# --evenly_distribute + + +# ------------------------------- +# FACE DETECTION PARQUET-SPECIFIC PARAMETERS +# ------------------------------- +# --input_type parquet: Tells the script to process Parquet files containing encoded images +# --model_weights: YOLO face detection model file: +# - "yolov8n-face.pt" (fastest, least accurate) +# - "yolov8s-face.pt" (balanced) +# - "yolov8m-face.pt" (more accurate) +# - "yolov8l-face.pt" (most accurate, slowest) +# --confidence_threshold: Minimum confidence score for face detections (0.0-1.0) +# Lower = more detections (including false positives) +# Higher = fewer, more confident detections +# --read_batch_size: Number of rows to read from Parquet files at once +# Larger values use more memory but may improve I/O performance +# --read_columns: Columns to read from Parquet files (space-separated) +# REQUIRED: uuid, image +# OPTIONAL: original_size, resized_size, etc. +# --file_list: [OPTIONAL] Path to text file containing list of Parquet files to process +# Useful for processing specific subsets of data +# --evenly_distribute: Distribute files evenly based on size for better load balancing +# --stagger: [OPTIONAL] Stagger worker start times to reduce I/O congestion + +# PARQUET DATA REQUIREMENTS: +# Your Parquet files must contain: +# 1. 'uuid' column: Unique string identifier for each image +# 2. 'image' column: Image data encoded as bytes (from PIL Image.save() to BytesIO) +# 3. Optional metadata columns as specified in --read_columns + +# OUTPUT FORMAT: +# The script outputs Parquet files containing: +# - uuid: Unique identifier for each image (from input Parquet) +# - detection_score: Maximum confidence score for face detection (0.0 if no faces detected) +# Files are saved in: {OUTPUT_DIR}/detections/rank_{rank}/ + +# ------------------------------- +# SLURM Template Field Explanations +# ------------------------------- +# --job-name: Name for your job in the queue/monitoring system. +# --nodes: Number of nodes to allocate for the job. +# --gpus-per-task: Number of GPUs per task (set to 1 unless using model parallelism). +# --cpus-per-task: Number of CPU cores per task (should match or exceed your data loader workers). +# --ntasks-per-node: Number of parallel tasks per node. +# For face detection, balance between available GPUs and I/O capacity. +# --partition: Cluster partition/queue to submit to (e.g., gpu, gpu-exp). +# --time: Maximum walltime for the job (format: HH:MM:SS). +# --output: Path for standard output log file (use %j for job ID). +# --error: Path for standard error log file. +# --account: Your allocation/project account for resource usage. 
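+
+# PREPARING INPUT PARQUET FILES (OPTIONAL):
+# A minimal sketch for packing a folder of images into the expected Parquet layout
+# (uuid + image bytes), assuming pandas and Pillow are installed; the paths, JPEG
+# re-encoding, and output file name are illustrative only.
+#
+#   from io import BytesIO
+#   from pathlib import Path
+#   from PIL import Image
+#   import pandas as pd
+#
+#   rows = []
+#   for p in Path("/path/to/your/images").rglob("*.jpg"):
+#       buf = BytesIO()
+#       Image.open(p).convert("RGB").save(buf, format="JPEG")   # encode image to bytes
+#       rows.append({"uuid": p.name, "image": buf.getvalue()})
+#   pd.DataFrame(rows).to_parquet("/path/to/your/parquet_files/batch_000.parquet")
+#
+# Optional metadata columns (e.g., original_size) can be added to the same rows and
+# listed in --read_columns.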
+ +# PERFORMANCE TIPS FOR FACE DETECTION ON PARQUET: +# - Face detection is typically faster than embedding, consider larger batch sizes +# - Use --evenly_distribute for better load balancing across workers +# - Increase --read_batch_size for better I/O performance (watch memory usage) +# - Use --file_list to process specific subsets of your data efficiently +# - Lower --confidence_threshold finds more faces but increases false positives +# - Choose appropriate YOLO model size based on accuracy vs speed requirements +# - Consider --stagger if experiencing I/O bottlenecks during startup + +# YOLO MODEL DOWNLOAD: +# YOLO models will be automatically downloaded on first use +# Ensure internet connectivity or pre-download models to avoid delays during job execution + +# FILE_LIST FORMAT (if using --file_list): +# Create a text file with one Parquet file path per line: +# /path/to/data/file1.parquet +# /path/to/data/file2.parquet +# /path/to/data/file3.parquet diff --git a/src/hpc_inference/inference/detection/__init__.py b/src/hpc_inference/inference/detection/__init__.py new file mode 100644 index 0000000..5c698bf --- /dev/null +++ b/src/hpc_inference/inference/detection/__init__.py @@ -0,0 +1,17 @@ +""" +Detection modules for various YOLO-based detection tasks. +""" + +from .base_detector import BaseDetector +from .face_detector import FaceDetector +from .animal_detector import AnimalDetector +from .face_detect import main as face_detect_main +from .animal_detect import main as animal_detect_main + +__all__ = [ + "BaseDetector", + "FaceDetector", + "AnimalDetector", + "face_detect_main", + "animal_detect_main" +] \ No newline at end of file diff --git a/src/hpc_inference/inference/detection/animal_detect.py b/src/hpc_inference/inference/detection/animal_detect.py new file mode 100644 index 0000000..e9e64b1 --- /dev/null +++ b/src/hpc_inference/inference/detection/animal_detect.py @@ -0,0 +1,364 @@ +import time +import threading +import torch +import numpy as np +from torch.utils.data import DataLoader +import os +from pathlib import Path +from datetime import datetime +from typing import Dict, Optional, Union, Any, List +import pyarrow as pa +import pyarrow.parquet as pq +import json + +from ...datasets.parquet_dataset import ParquetImageDataset +from ...datasets.image_folder_dataset import ImageFolderDataset +from ...utils.common import format_time, decode_image, save_emb_to_parquet, load_config +from ...utils.transforms import MegaDetector_v5_Transform +from ...utils import profiling +from .animal_detector import AnimalDetector + +import logging +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()] +) + +def save_detection_results(uuids: List[str], detection_results: List[Dict[str, Any]], output_file: str) -> None: + """Save animal detection results to parquet file using PyArrow for better performance.""" + + # Extract data for parquet storage + max_scores = [] + num_detections = [] + detections_json = [] + + for result in detection_results: + max_scores.append(result["max_detection_score"]) + num_detections.append(result["num_detections"]) + # Store detections as JSON string for complex data + detections_json.append(json.dumps(result["detections"])) + + # Create PyArrow table + table = pa.table({ + 'uuid': pa.array(uuids, type=pa.string()), + 'max_detection_score': pa.array(max_scores, type=pa.float32()), + 'num_detections': pa.array(num_detections, type=pa.int32()), + 'detections': 
pa.array(detections_json, type=pa.string()) + }) + + # Write to parquet with optimized settings + pq.write_table( + table, + output_file, + compression='snappy', + use_dictionary=True, + write_statistics=True + ) + + total_detections = sum(num_detections) + logging.info(f"Saved {len(uuids)} images with {total_detections} total detections to {output_file}") + +@torch.no_grad() +def main( + config: Dict[str, Any], + target_dir: Union[str, Path], + output_dir: Union[str, Path], + input_type: str, + file_list: Optional[Union[str, Path]] = None +) -> None: + """ + Main function for YOLO animal detection using MegaDetector. + + Args: + config: Configuration dictionary containing model and processing parameters. + target_dir: Directory containing input data (Parquet files or images). + output_dir: Directory to save output detection results and profiles. + input_type: Type of input data ("images" or "parquet"). + file_list: Optional file containing list of Parquet files to process. + """ + # Validate input type + if input_type not in ["images", "parquet"]: + raise ValueError(f"Invalid input_type: {input_type}. Must be 'images' or 'parquet'") + + # =============== # + # ---- Setup ---- + # =============== # + start_time = time.time() + + global_rank = int(os.environ.get("SLURM_PROCID", 0)) + local_rank = 0 + world_size = int(os.environ.get("SLURM_NTASKS", 1)) + logging.info(f"Global rank: {global_rank}, Local rank: {local_rank}, World size: {world_size}") + + base_output_dir = os.path.abspath(str(output_dir)) + detections_output_dir = os.path.abspath(os.path.join(base_output_dir, "detections", f"rank_{global_rank}")) + os.makedirs(detections_output_dir, exist_ok=True) + profile_dir = os.path.abspath(os.path.join(base_output_dir, "profile_results", f"rank_{global_rank}")) + os.makedirs(profile_dir, exist_ok=True) + + device = f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu" + + # Initialize and load animal detector + animal_detector = AnimalDetector(config, device) + animal_detector.load_model() + + # Create MegaDetector-specific preprocessing transforms + # Uses transforms from utils.transforms module (adapted from Microsoft CameraTraps) + image_size = config.get("image_size", 1280) # MegaDetector standard input size + preprocess = MegaDetector_v5_Transform(target_size=image_size) + + logging.info(f"Using MegaDetector v5 transforms with image size: {image_size}x{image_size}") + + # Create dataset based on input type + if input_type == "images": + logging.info(f"Processing image directory: {target_dir}") + + dataset = ImageFolderDataset( + target_dir, + preprocess=preprocess, + validate=config.get("validate_images", False), + rank=global_rank, + world_size=world_size, + evenly_distribute=config.get("evenly_distribute", True), + stagger=config.get("stagger", False), + uuid_mode=config.get("uuid_mode", "filename") + ) + + elif input_type == "parquet": + logging.info(f"Processing Parquet files from: {target_dir}") + + # Find all Parquet files in the target directory + target_path = Path(target_dir) + parquet_files = [] + + if file_list is None: + parquet_files = [str(p) for p in target_path.rglob('*.parquet')] + logging.info(f"Found {len(parquet_files)} Parquet files in {target_dir}") + else: + file_list_path = Path(file_list) + if file_list_path.exists(): + with open(file_list_path, "r") as f: + parquet_files = [line.strip() for line in f if line.strip().endswith('.parquet')] + logging.info(f"Loaded {len(parquet_files)} Parquet files from {file_list}") + else: + raise 
FileNotFoundError(f"File list not found: {file_list}") + + if not parquet_files: + raise ValueError(f"No Parquet files found in {target_dir}") + + processed_files_log = os.path.join( + detections_output_dir, f"processed_files_rank{global_rank}_{datetime.now().strftime('%Y%m%d%H%M%S')}.log" + ) + dataset = ParquetImageDataset( + parquet_files, + col_uuid="uuid", + rank=global_rank, + world_size=world_size, + evenly_distribute=config.get("evenly_distribute", True), + decode_fn=decode_image, + preprocess=preprocess, + read_batch_size=config.get("read_batch_size", 128), + read_columns=config.get("read_columns", ["uuid", "original_size", "resized_size", "image"]), + stagger=config.get("stagger", False), + processed_files_log=processed_files_log + ) + + loader = DataLoader( + dataset, + batch_size=config.get("batch_size", 16), + shuffle=False, + num_workers=config.get("num_workers", 28), + pin_memory=True, + prefetch_factor=config.get("prefetch_factor", 16) + ) + + all_detection_results = [] + all_uuids = [] + all_batch_stats = [] + usage_log = [] + file_idx = 0 + n_imgs_processed = 0 + + usage_stop = threading.Event() + usage_thread = threading.Thread( + target=profiling.start_usage_logging, + args=(usage_log, usage_stop, 0.5, 0) + ) + usage_thread.start() + + max_rows_per_file = config.get("max_rows_per_file", 10000) + out_prefix = config.get("out_prefix", "animal_detection_results") + conf_threshold = config.get("confidence_threshold", 0.2) + + # Main batch loop + for batch_idx, (uuids, images) in enumerate(loader): + batch_stats = {"batch": batch_idx, "batch_size": len(uuids)} + + t0 = time.perf_counter() + images = images.to(device) + t1 = time.perf_counter() + + # Run animal detection using AnimalDetector + detection_results = animal_detector.detect(images, conf_threshold) + t2 = time.perf_counter() + + all_detection_results.extend(detection_results) + all_uuids.extend(uuids) + n_imgs_processed += len(uuids) + + # Count total detections in this batch for logging + batch_detections = sum(result["num_detections"] for result in detection_results) + # logging.info(f"Batch {batch_idx}: {len(uuids)} images, {batch_detections} detections") + + # Save results when reaching max_rows_per_file + if len(all_uuids) >= max_rows_per_file: + out_file = os.path.join( + detections_output_dir, + f"{out_prefix}_rank_{global_rank}_{file_idx}.parquet" + ) + save_detection_results(all_uuids, all_detection_results, out_file) + file_idx += 1 + all_detection_results = [] + all_uuids = [] + + batch_stats.update({ + "preprocessing_s": t1 - t0, + "inference_s": t2 - t1, + "total_batch_s": t2 - t0, + "detections_found": batch_detections + }) + all_batch_stats.append(batch_stats) + + # Save remaining results + if len(all_uuids) > 0: + out_file = os.path.join( + detections_output_dir, + f"{out_prefix}_rank_{global_rank}_{file_idx}.parquet" + ) + save_detection_results(all_uuids, all_detection_results, out_file) + + # Stop profiling and save results + usage_stop.set() + usage_thread.join() + + elapsed = time.time() - start_time + total_detections = sum(stats.get("detections_found", 0) for stats in all_batch_stats) + + logging.info(f"Total images processed: {n_imgs_processed}") + logging.info(f"Total detections found: {total_detections}") + logging.info(f"Total time taken: {format_time(elapsed)}") + if n_imgs_processed > 0: + logging.info(f"Avg time/image: {elapsed/n_imgs_processed:.4f} sec") + logging.info(f"Throughput: {n_imgs_processed/elapsed:.2f} images/sec") + logging.info(f"Avg detections/image: 
{total_detections/n_imgs_processed:.2f}") + + profiling.log_computing_specs( + profile_dir, + config.get("batch_size", 16), config.get("num_workers", 28), + extra_info={ + "prefetch_factor": config.get("prefetch_factor", 16), + "read_batch_size": config.get("read_batch_size", 128), + "max_rows_per_file": config.get("max_rows_per_file", 10000), + "task": "Animal detection", + "model": config["model"]["weights"], + "confidence_threshold": conf_threshold, + "throughput": f"{n_imgs_processed/elapsed:.2f} images/sec" if n_imgs_processed > 0 else "0 images/sec", + "total_images": n_imgs_processed, + "total_detections": total_detections, + "avg_detections_per_image": f"{total_detections/n_imgs_processed:.2f}" if n_imgs_processed > 0 else "0", + "total_time_s": elapsed, + "input_type": input_type + } + ) + + stats_df = profiling.save_batch_stats(all_batch_stats, profile_dir) + usage_df = profiling.save_usage_log(usage_log, profile_dir) + profiling.save_usage_plots(usage_df, profile_dir) + profiling.save_batch_timings_plot(stats_df, profile_dir) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="YOLO Animal Detection (MegaDetector) with Config File or Command Line Arguments") + parser.add_argument("target_dir", type=str, help="Directory containing input data") + parser.add_argument("output_dir", type=str, help="Directory to save output detection results") + parser.add_argument("--input_type", type=str, required=True, choices=["images", "parquet"], + help="Type of input data: 'images' for image directory, 'parquet' for Parquet files") + parser.add_argument("--config", type=str, default=None, help="Path to YAML config file (optional)") + parser.add_argument("--file_list", type=str, default=None, + help="File containing list of Parquet files to process (only for --input_type parquet)") + + # Model configuration arguments (used when no config file provided) + parser.add_argument("--model_weights", type=str, default="md_v5a.0.0.pt", + help="YOLO model weights file (default: md_v5a.0.0.pt - MegaDetector v5a)") + parser.add_argument("--confidence_threshold", type=float, default=0.2, + help="Confidence threshold for animal detection (default: 0.2)") + parser.add_argument("--image_size", type=int, default=1280, + help="Input image size for MegaDetector (default: 1280)") + + # Compute arguments + parser.add_argument("--batch_size", type=int, default=16, help="Batch size for inference") + parser.add_argument("--num_workers", type=int, default=20, help="Number of dataloader workers") + parser.add_argument("--prefetch_factor", type=int, default=16, help="Dataloader prefetch factor") + parser.add_argument("--read_batch_size", type=int, default=128, help="Parquet read batch size") + parser.add_argument("--max_rows_per_file", type=int, default=10000, help="Max rows per output file") + parser.add_argument("--out_prefix", type=str, default="animal_detection_results", help="Output file prefix") + parser.add_argument("--read_columns", type=str, nargs="+", + default=["uuid", "original_size", "resized_size", "image"], + help="Columns to read from Parquet files (only for --input_type parquet)") + parser.add_argument("--evenly_distribute", action="store_true", default=True, + help="Distribute files evenly based on size (recommended for better load balancing)") + parser.add_argument("--stagger", action="store_true", + help="Stagger worker start times") + + # Image folder specific arguments + parser.add_argument("--validate_images", action="store_true", + help="Validate images 
using PIL (slower but safer, only for --input_type images)") + parser.add_argument("--uuid_mode", type=str, default="filename", + choices=["filename", "relative", "fullpath", "hash"], + help="How to generate UUIDs from image paths (only for --input_type images)") + + args = parser.parse_args() + + # Validate argument combinations + if args.input_type == "parquet" and args.file_list and not os.path.exists(args.file_list): + parser.error(f"File list does not exist: {args.file_list}") + + if args.input_type == "images" and args.file_list: + parser.error("--file_list is only applicable when --input_type is 'parquet'") + + # Load config or create from arguments + if args.config: + config = load_config(args.config) + print(f"Using config file: {args.config}") + else: + # Create config from command line arguments + config = { + "model": { + "weights": args.model_weights + }, + "batch_size": args.batch_size, + "num_workers": args.num_workers, + "prefetch_factor": args.prefetch_factor, + "read_batch_size": args.read_batch_size, + "max_rows_per_file": args.max_rows_per_file, + "out_prefix": args.out_prefix, + "read_columns": args.read_columns, + "validate_images": args.validate_images, + "uuid_mode": args.uuid_mode, + "evenly_distribute": args.evenly_distribute, + "stagger": args.stagger, + "confidence_threshold": args.confidence_threshold, + "image_size": args.image_size + } + print("Using command line arguments (no config file provided)") + + main( + config, + target_dir=args.target_dir, + output_dir=args.output_dir, + input_type=args.input_type, + file_list=args.file_list + ) diff --git a/src/hpc_inference/inference/detection/animal_detector.py b/src/hpc_inference/inference/detection/animal_detector.py new file mode 100644 index 0000000..95e3ee5 --- /dev/null +++ b/src/hpc_inference/inference/detection/animal_detector.py @@ -0,0 +1,140 @@ +""" +Animal detection using YOLO models (MegaDetector). +""" +from typing import List, Dict, Any +import torch +import numpy as np +import logging + +from .base_detector import BaseDetector + + +class AnimalDetector(BaseDetector): + """ + YOLO-based animal detector using MegaDetector models. + + This detector loads animal detection models (e.g., MegaDetector from Microsoft) + and performs animal detection on image batches, returning both confidence scores + and bounding box coordinates. + """ + + # MegaDetector class names + CLASS_NAMES = { + 0: "animal", + 1: "person", + 2: "vehicle" + } + + def detect(self, images: torch.Tensor, conf_threshold: float = 0.2) -> List[Dict[str, Any]]: + """ + Detect animals in a batch of images and return detection results. + + Args: + images: Batch of preprocessed images as tensor (B, C, H, W) + conf_threshold: Confidence threshold for detections + + Returns: + List[Dict[str, Any]]: List of detection results, one dict per image. + + Each image result dict contains: + { + "max_detection_score": float, # Maximum confidence score across all detections (0.0 if none) + "num_detections": int, # Total number of detections above threshold + "detections": [ # List of individual detection objects + { + "bbox": [x1, y1, x2, y2], # Absolute pixel coordinates (float) + "bbox_normalized": [x1, y1, x2, y2], # Normalized coordinates 0-1 (float) + "confidence": float, # Detection confidence score (0.0-1.0) + "class_id": int, # Numeric class ID (0=animal, 1=person, 2=vehicle) + "class_name": str # Human-readable class name + }, + # ... 
more detections + ] + } + + Example return for batch of 2 images: + [ + { + "max_detection_score": 0.85, + "num_detections": 2, + "detections": [ + { + "bbox": [120.5, 80.2, 340.8, 290.1], + "bbox_normalized": [0.118, 0.078, 0.333, 0.284], + "confidence": 0.85, + "class_id": 1, + "class_name": "animal" + }, + { + "bbox": [450.0, 200.0, 600.0, 400.0], + "bbox_normalized": [0.440, 0.195, 0.586, 0.391], + "confidence": 0.72, + "class_id": 2, + "class_name": "person" + } + ] + }, + { + "max_detection_score": 0.0, + "num_detections": 0, + "detections": [] + } + ] + """ + if not self.is_loaded(): + raise RuntimeError("Model not loaded. Call load_model() first.") + + # Run inference on the entire batch at once + results = self.model(images, verbose=False) + + # Process detection results + batch_results = [] + for result in results: + image_result = { + "max_detection_score": 0.0, + "detections": [], + "num_detections": 0 + } + + if result.boxes is not None and len(result.boxes) > 0: + # Extract detection data + boxes = result.boxes.xyxy.cpu().numpy() # [x1, y1, x2, y2] + confidences = result.boxes.conf.cpu().numpy() + class_ids = result.boxes.cls.cpu().numpy().astype(int) + + # Get image dimensions for normalization + img_height, img_width = result.orig_shape + + # Filter detections by confidence threshold + valid_indices = confidences >= conf_threshold + + if np.any(valid_indices): + valid_boxes = boxes[valid_indices] + valid_confidences = confidences[valid_indices] + valid_class_ids = class_ids[valid_indices] + + # Store maximum confidence score + image_result["max_detection_score"] = float(np.max(valid_confidences)) + image_result["num_detections"] = len(valid_confidences) + + # Process each detection + for box, conf, class_id in zip(valid_boxes, valid_confidences, valid_class_ids): + x1, y1, x2, y2 = box + + detection = { + "bbox": [float(x1), float(y1), float(x2), float(y2)], # Absolute coordinates + "bbox_normalized": [ # Normalized coordinates [0-1] + float(x1 / img_width), + float(y1 / img_height), + float(x2 / img_width), + float(y2 / img_height) + ], + "confidence": float(conf), + "class_id": int(class_id), + "class_name": self.CLASS_NAMES.get(int(class_id), "unknown") + } + image_result["detections"].append(detection) + + batch_results.append(image_result) + + return batch_results diff --git a/src/hpc_inference/inference/detection/base_detector.py b/src/hpc_inference/inference/detection/base_detector.py new file mode 100644 index 0000000..1972ce4 --- /dev/null +++ b/src/hpc_inference/inference/detection/base_detector.py @@ -0,0 +1,85 @@ +""" +Base detector class for YOLO-based detection models. +""" +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional +import torch +import logging + +try: + from ultralytics import YOLO + YOLO_AVAILABLE = True +except ImportError: + YOLO_AVAILABLE = False + YOLO = None + + +def check_yolo_dependencies() -> None: + """Check if YOLO dependencies are available.""" + if not YOLO_AVAILABLE: + raise ImportError( + "Ultralytics YOLO is not installed. Install with: " + "pip install 'hpc-inference[yolo]' or pip install ultralytics" + ) + + +class BaseDetector(ABC): + """ + Base class for YOLO-based detection models. + + This class provides a common interface for different detection tasks + (face detection, animal detection, etc.) that use YOLO models. + """ + + def __init__(self, config: Dict[str, Any], device: Optional[str] = None): + """ + Initialize the detector. 
+ + Args: + config: Configuration dictionary containing model parameters + device: Device to run the model on (e.g., 'cuda:0', 'cpu'). + If None, will auto-detect. + """ + check_yolo_dependencies() + + self.config = config + self.model = None + self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') + + def load_model(self) -> None: + """Load the YOLO model from weights.""" + if "model" not in self.config: + raise ValueError("Config must contain 'model' section") + + model_config = self.config["model"] + if "weights" not in model_config: + raise ValueError("Model config must contain 'weights' field") + + weights_path = model_config["weights"] + logging.info(f"Loading YOLO model from: {weights_path}") + + self.model = YOLO(weights_path) + self.model.to(self.device) + + # Note: YOLO models are automatically in eval mode for inference + # Calling .eval() can trigger training initialization, so we avoid it + + logging.info(f"Model loaded successfully on device: {self.device}") + + @abstractmethod + def detect(self, images: torch.Tensor, conf_threshold: float = 0.5) -> Any: + """ + Perform detection on a batch of images. + + Args: + images: Batch of preprocessed images as tensor (B, C, H, W) + conf_threshold: Confidence threshold for detections + + Returns: + Detection results (format depends on specific detector implementation) + """ + pass + + def is_loaded(self) -> bool: + """Check if the model is loaded.""" + return self.model is not None diff --git a/src/hpc_inference/inference/detection/face_detect.py b/src/hpc_inference/inference/detection/face_detect.py new file mode 100644 index 0000000..dd6039b --- /dev/null +++ b/src/hpc_inference/inference/detection/face_detect.py @@ -0,0 +1,337 @@ +import time +import threading +import torch +import numpy as np +from torch.utils.data import DataLoader +import os +from pathlib import Path +from datetime import datetime +from typing import Dict, Optional, Union, Any, List +import pyarrow as pa +import pyarrow.parquet as pq + +from ...datasets.parquet_dataset import ParquetImageDataset +from ...datasets.image_folder_dataset import ImageFolderDataset +from ...utils.common import format_time, decode_image, load_config +from ...utils import profiling +from .face_detector import FaceDetector + +import logging +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()] +) + +def save_detection_results(uuids: List[str], scores: List[float], output_file: str) -> None: + """Save detection results to parquet file using PyArrow for better performance.""" + + # Create PyArrow table directly + table = pa.table({ + 'uuid': pa.array(uuids, type=pa.string()), + 'detection_score': pa.array(scores, type=pa.float32()) + }) + + # Write to parquet with optimized settings + pq.write_table( + table, + output_file, + compression='snappy', + use_dictionary=True, + write_statistics=True + ) + +@torch.no_grad() +def main( + config: Dict[str, Any], + target_dir: Union[str, Path], + output_dir: Union[str, Path], + input_type: str, + file_list: Optional[Union[str, Path]] = None +) -> None: + """ + Main function for YOLO face detection. + + Args: + config: Configuration dictionary containing model and processing parameters. + target_dir: Directory containing input data (Parquet files or images). + output_dir: Directory to save output detection results and profiles. + input_type: Type of input data ("images" or "parquet"). 
+ file_list: Optional file containing list of Parquet files to process. + """ + # Validate input type + if input_type not in ["images", "parquet"]: + raise ValueError(f"Invalid input_type: {input_type}. Must be 'images' or 'parquet'") + + # =============== # + # ---- Setup ---- + # =============== # + start_time = time.time() + + global_rank = int(os.environ.get("SLURM_PROCID", 0)) + local_rank = 0 + world_size = int(os.environ.get("SLURM_NTASKS", 1)) + logging.info(f"Global rank: {global_rank}, Local rank: {local_rank}, World size: {world_size}") + + base_output_dir = os.path.abspath(str(output_dir)) + detections_output_dir = os.path.abspath(os.path.join(base_output_dir, "detections", f"rank_{global_rank}")) + os.makedirs(detections_output_dir, exist_ok=True) + profile_dir = os.path.abspath(os.path.join(base_output_dir, "profile_results", f"rank_{global_rank}")) + os.makedirs(profile_dir, exist_ok=True) + + device = f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu" + + # Initialize and load face detector + face_detector = FaceDetector(config, device) + face_detector.load_model() + + # Simple preprocessing for YOLO (just resize) + from torchvision import transforms + image_size = config.get("image_size", 1024) + preprocess = transforms.Compose([ + transforms.Resize((image_size, image_size)), + transforms.ToTensor() + ]) + + # Create dataset based on input type + if input_type == "images": + logging.info(f"Processing image directory: {target_dir}") + + dataset = ImageFolderDataset( + target_dir, + preprocess=preprocess, + validate=config.get("validate_images", False), + rank=global_rank, + world_size=world_size, + evenly_distribute=config.get("evenly_distribute", True), + stagger=config.get("stagger", False), + uuid_mode=config.get("uuid_mode", "filename") + ) + + elif input_type == "parquet": + logging.info(f"Processing Parquet files from: {target_dir}") + + # Find all Parquet files in the target directory + target_path = Path(target_dir) + parquet_files = [] + + if file_list is None: + parquet_files = [str(p) for p in target_path.rglob('*.parquet')] + logging.info(f"Found {len(parquet_files)} Parquet files in {target_dir}") + else: + file_list_path = Path(file_list) + if file_list_path.exists(): + with open(file_list_path, "r") as f: + parquet_files = [line.strip() for line in f if line.strip().endswith('.parquet')] + logging.info(f"Loaded {len(parquet_files)} Parquet files from {file_list}") + else: + raise FileNotFoundError(f"File list not found: {file_list}") + + if not parquet_files: + raise ValueError(f"No Parquet files found in {target_dir}") + + processed_files_log = os.path.join( + detections_output_dir, f"processed_files_rank{global_rank}_{datetime.now().strftime('%Y%m%d%H%M%S')}.log" + ) + dataset = ParquetImageDataset( + parquet_files, + col_uuid="uuid", + rank=global_rank, + world_size=world_size, + evenly_distribute=config.get("evenly_distribute", True), + decode_fn=decode_image, + preprocess=preprocess, + read_batch_size=config.get("read_batch_size", 128), + read_columns=config.get("read_columns", ["uuid", "original_size", "resized_size", "image"]), + stagger=config.get("stagger", False), + processed_files_log=processed_files_log + ) + + loader = DataLoader( + dataset, + batch_size=config.get("batch_size", 16), + shuffle=False, + num_workers=config.get("num_workers", 28), + pin_memory=True, + prefetch_factor=config.get("prefetch_factor", 16) + ) + + all_detection_scores = [] + all_uuids = [] + all_batch_stats = [] + usage_log = [] + file_idx = 0 + n_imgs_processed 
= 0 + + usage_stop = threading.Event() + usage_thread = threading.Thread( + target=profiling.start_usage_logging, + args=(usage_log, usage_stop, 0.5, 0) + ) + usage_thread.start() + + max_rows_per_file = config.get("max_rows_per_file", 10000) + out_prefix = config.get("out_prefix", "face_detection_results") + conf_threshold = config.get("confidence_threshold", 0.5) + + # Main batch loop + for batch_idx, (uuids, images) in enumerate(loader): + batch_stats = {"batch": batch_idx, "batch_size": len(uuids)} + + t0 = time.perf_counter() + images = images.to(device) + t1 = time.perf_counter() + + # Run face detection using FaceDetector + detection_scores = face_detector.detect(images, conf_threshold) + t2 = time.perf_counter() + + all_detection_scores.extend(detection_scores) + all_uuids.extend(uuids) + n_imgs_processed += len(uuids) + + + # Save results when reaching max_rows_per_file + if len(all_uuids) >= max_rows_per_file: + out_file = os.path.join( + detections_output_dir, + f"{out_prefix}_rank_{global_rank}_{file_idx}.parquet" + ) + save_detection_results(all_uuids, all_detection_scores, out_file) + file_idx += 1 + all_detection_scores = [] + all_uuids = [] + + batch_stats.update({ + "preprocessing_s": t1 - t0, + "inference_s": t2 - t1, + "total_batch_s": t2 - t0, + }) + all_batch_stats.append(batch_stats) + + # Save remaining results + if len(all_uuids) > 0: + out_file = os.path.join( + detections_output_dir, + f"{out_prefix}_rank_{global_rank}_{file_idx}.parquet" + ) + save_detection_results(all_uuids, all_detection_scores, out_file) + + # Stop profiling and save results + usage_stop.set() + usage_thread.join() + + elapsed = time.time() - start_time + logging.info(f"Total images processed: {n_imgs_processed}") + logging.info(f"Total time taken: {format_time(elapsed)}") + if n_imgs_processed > 0: + logging.info(f"Avg time/image: {elapsed/n_imgs_processed:.4f} sec") + logging.info(f"Throughput: {n_imgs_processed/elapsed:.2f} images/sec") + + profiling.log_computing_specs( + profile_dir, + config.get("batch_size", 16), config.get("num_workers", 28), + extra_info={ + "prefetch_factor": config.get("prefetch_factor", 16), + "read_batch_size": config.get("read_batch_size", 128), + "max_rows_per_file": config.get("max_rows_per_file", 10000), + "task": "Face detection", + "model": config["model"]["weights"], + "confidence_threshold": conf_threshold, + "throughput": f"{n_imgs_processed/elapsed:.2f} images/sec" if n_imgs_processed > 0 else "0 images/sec", + "total_images": n_imgs_processed, + "total_time_s": elapsed, + "input_type": input_type + } + ) + + stats_df = profiling.save_batch_stats(all_batch_stats, profile_dir) + usage_df = profiling.save_usage_log(usage_log, profile_dir) + profiling.save_usage_plots(usage_df, profile_dir) + profiling.save_batch_timings_plot(stats_df, profile_dir) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="YOLO Face Detection with Config File or Command Line Arguments") + parser.add_argument("target_dir", type=str, help="Directory containing input data") + parser.add_argument("output_dir", type=str, help="Directory to save output detection results") + parser.add_argument("--input_type", type=str, required=True, choices=["images", "parquet"], + help="Type of input data: 'images' for image directory, 'parquet' for Parquet files") + parser.add_argument("--config", type=str, default=None, help="Path to YAML config file (optional)") + parser.add_argument("--file_list", type=str, default=None, + help="File containing list 
of Parquet files to process (only for --input_type parquet)") + + # Model configuration arguments (used when no config file provided) + parser.add_argument("--model_weights", type=str, default="yolov8n-face.pt", + help="YOLO model weights file (default: yolov8n-face.pt)") + parser.add_argument("--confidence_threshold", type=float, default=0.5, + help="Confidence threshold for face detection (default: 0.5)") + parser.add_argument("--image_size", type=int, default=1024, + help="Input image size for YOLO model (square, default: 1024)") + + # Compute arguments + parser.add_argument("--batch_size", type=int, default=16, help="Batch size for inference") + parser.add_argument("--num_workers", type=int, default=28, help="Number of dataloader workers") + parser.add_argument("--prefetch_factor", type=int, default=16, help="Dataloader prefetch factor") + parser.add_argument("--read_batch_size", type=int, default=128, help="Parquet read batch size") + parser.add_argument("--max_rows_per_file", type=int, default=10000, help="Max rows per output file") + parser.add_argument("--out_prefix", type=str, default="face_detection_results", help="Output file prefix") + parser.add_argument("--read_columns", type=str, nargs="+", + default=["uuid", "original_size", "resized_size", "image"], + help="Columns to read from Parquet files (only for --input_type parquet)") + parser.add_argument("--evenly_distribute", action="store_true", default=True, + help="Distribute files evenly based on size (recommended for better load balancing)") + parser.add_argument("--stagger", action="store_true", + help="Stagger worker start times") + + # Image folder specific arguments + parser.add_argument("--validate_images", action="store_true", + help="Validate images using PIL (slower but safer, only for --input_type images)") + parser.add_argument("--uuid_mode", type=str, default="filename", + choices=["filename", "relative", "fullpath", "hash"], + help="How to generate UUIDs from image paths (only for --input_type images)") + + args = parser.parse_args() + + # Validate argument combinations + if args.input_type == "parquet" and args.file_list and not os.path.exists(args.file_list): + parser.error(f"File list does not exist: {args.file_list}") + + if args.input_type == "images" and args.file_list: + parser.error("--file_list is only applicable when --input_type is 'parquet'") + + # Load config or create from arguments + if args.config: + config = load_config(args.config) + print(f"Using config file: {args.config}") + else: + # Create config from command line arguments + config = { + "model": { + "weights": args.model_weights + }, + "batch_size": args.batch_size, + "num_workers": args.num_workers, + "prefetch_factor": args.prefetch_factor, + "read_batch_size": args.read_batch_size, + "max_rows_per_file": args.max_rows_per_file, + "out_prefix": args.out_prefix, + "read_columns": args.read_columns, + "validate_images": args.validate_images, + "uuid_mode": args.uuid_mode, + "evenly_distribute": args.evenly_distribute, + "stagger": args.stagger, + "confidence_threshold": args.confidence_threshold, + "image_size": args.image_size + } + print("Using command line arguments (no config file provided)") + + main( + config, + target_dir=args.target_dir, + output_dir=args.output_dir, + input_type=args.input_type, + file_list=args.file_list + ) diff --git a/src/hpc_inference/inference/detection/face_detector.py b/src/hpc_inference/inference/detection/face_detector.py new file mode 100644 index 0000000..2779ec5 --- /dev/null +++ 
b/src/hpc_inference/inference/detection/face_detector.py @@ -0,0 +1,51 @@ +""" +Face detection using YOLO models. +""" +from typing import List +import torch +import logging + +from .base_detector import BaseDetector + + +class FaceDetector(BaseDetector): + """ + YOLO-based face detector. + + This detector loads face detection models (e.g., from yolo-face repo) + and performs face detection on image batches. + """ + + def detect(self, images: torch.Tensor, conf_threshold: float = 0.5) -> List[float]: + """ + Detect faces in a batch of images and return detection scores. + + Args: + images: Batch of preprocessed images as tensor (B, C, H, W) + conf_threshold: Confidence threshold for detections + + Returns: + List of detection scores (max confidence score per image) + """ + if not self.is_loaded(): + raise RuntimeError("Model not loaded. Call load_model() first.") + + # Run inference on the entire batch at once + results = self.model(images, verbose=False) + + # Process detection scores + detection_scores = [] + for result in results: + if result.boxes is not None and len(result.boxes) > 0: + confidences = result.boxes.conf + # Keep operations on GPU until final conversion + valid_confidences = confidences[confidences >= conf_threshold] + if len(valid_confidences) > 0: + max_score = float(torch.max(valid_confidences).cpu()) + else: + max_score = 0.0 + else: + max_score = 0.0 + detection_scores.append(max_score) + + return detection_scores diff --git a/src/hpc_inference/utils/__init__.py b/src/hpc_inference/utils/__init__.py index d98865e..7e3b65d 100644 --- a/src/hpc_inference/utils/__init__.py +++ b/src/hpc_inference/utils/__init__.py @@ -9,6 +9,14 @@ multi_model_collate, pil_image_collate, ) +from .transforms import letterbox, MegaDetector_v5_Transform +from .visualization import ( + reverse_letterbox_coords, + plot_detections_matplotlib, + plot_detections_pil, + save_detection_visualization, + get_class_colors +) from . import profiling __all__ = [ @@ -16,6 +24,13 @@ "save_emb_to_parquet", "format_time", "load_config", + "letterbox", + "MegaDetector_v5_Transform", + "reverse_letterbox_coords", + "plot_detections_matplotlib", + "plot_detections_pil", + "save_detection_visualization", + "get_class_colors", "assign_files_to_rank", "assign_indices_to_rank", "get_distributed_info", diff --git a/src/hpc_inference/utils/transforms.py b/src/hpc_inference/utils/transforms.py new file mode 100644 index 0000000..141252e --- /dev/null +++ b/src/hpc_inference/utils/transforms.py @@ -0,0 +1,298 @@ +""" +Image transformation utilities for computer vision models. + +This module contains preprocessing transforms adapted from various sources, +primarily for object detection models like YOLO and MegaDetector. + +Copyright attributions are included for each function/class as appropriate. +""" + +import torch +import torch.nn.functional as F +import torchvision.transforms as T +import numpy as np +from PIL import Image +from typing import Union, Tuple + + +def letterbox( + im: Union[Image.Image, torch.Tensor], + new_shape: Union[int, Tuple[int, int]] = (640, 640), + color: Tuple[int, int, int] = (114, 114, 114), + auto: bool = False, + scaleFill: bool = False, + scaleup: bool = True, + stride: int = 32 +) -> torch.Tensor: + """ + Resize and pad an image to a desired shape while keeping the aspect ratio unchanged. + + This function is commonly used in object detection tasks to prepare images for models + like YOLOv5 and MegaDetector. 
It resizes the image to fit into the new shape with the + correct aspect ratio and then pads the rest with a specified color. + + Based on letterbox implementation from Microsoft CameraTraps PytorchWildlife. + Copyright (c) Microsoft Corporation. All rights reserved. + Licensed under the MIT License. + + Original source: https://github.com/microsoft/CameraTraps/blob/main/PytorchWildlife/data/transforms.py + + Args: + im (PIL.Image.Image or torch.Tensor): The input image. Can be a PIL image or + a PyTorch tensor with shape (C, H, W) and values in [0, 1]. + new_shape (int or tuple, optional): The target size of the image. If int, creates + a square image (new_shape, new_shape). If tuple, should be (height, width). + Defaults to (640, 640). + color (tuple, optional): The RGB color values used for padding, range [0, 255]. + Defaults to (114, 114, 114) which is a gray color commonly used in YOLO. + auto (bool, optional): If True, adjusts padding to ensure the padded image + dimensions are a multiple of the stride. Defaults to False. + scaleFill (bool, optional): If True, scales the image to fill the new shape + exactly, ignoring aspect ratio (may cause distortion). Defaults to False. + scaleup (bool, optional): If True, allows the function to scale up the image. + If False, only scales down. Defaults to True. + stride (int, optional): The stride used in the model. When auto=True, padding + is adjusted to be a multiple of this stride. Defaults to 32. + + Returns: + torch.Tensor: The transformed image as a tensor with shape (C, H, W) and + values in [0, 1]. The output will have exactly the dimensions specified + by new_shape. + + Examples: + >>> from PIL import Image + >>> import torch + >>> + >>> # Load and transform a PIL image + >>> pil_img = Image.open("photo.jpg") # e.g., 800x600 RGB image + >>> transformed = letterbox(pil_img, new_shape=640) + >>> print(transformed.shape) # torch.Size([3, 640, 640]) + >>> + >>> # Transform a tensor image + >>> tensor_img = torch.rand(3, 480, 320) # Random image tensor + >>> transformed = letterbox(tensor_img, new_shape=(1280, 1280), scaleup=False) + >>> print(transformed.shape) # torch.Size([3, 1280, 1280]) + >>> + >>> # Use with auto padding for model stride + >>> transformed = letterbox(pil_img, new_shape=640, auto=True, stride=32) + >>> # Output dimensions will be multiples of 32 + + Note: + - Input PIL images are automatically converted to tensors + - The function preserves aspect ratio unless scaleFill=True + - Padding is applied symmetrically when possible + - Output tensor values are normalized to [0, 1] range + """ + # Convert PIL Image to Torch Tensor + if isinstance(im, Image.Image): + im = T.ToTensor()(im) + + # Original shape + shape = im.shape[1:] # shape = [height, width] + + # New shape - convert int to tuple if needed + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) and compute padding + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: + r = min(r, 1.0) + + new_unpad = (int(round(shape[1] * r)), int(round(shape[0] * r))) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] + + if auto: + # Make padding a multiple of stride + dw, dh = dw % stride, dh % stride + elif scaleFill: + # Scale to fill entire new_shape, ignore aspect ratio + dw, dh = 0, 0 + new_unpad = new_shape + r = new_shape[1] / shape[1], new_shape[0] / shape[0] + + # Divide padding by 2 for symmetric padding + dw /= 2 + dh /= 2 + + # Resize image if current size != target unpadded 
size + if shape[::-1] != new_unpad: + resize_transform = T.Resize( + new_unpad[::-1], + interpolation=T.InterpolationMode.BILINEAR, + antialias=False + ) + im = resize_transform(im) + + # Apply padding + # Padding format: (left, right, top, bottom) + padding = ( + int(round(dw - 0.1)), + int(round(dw + 0.1)), + int(round(dh - 0.1)), + int(round(dh + 0.1)) + ) + # Scale to [0,255], pad with color value, then scale back to [0,1] + im = F.pad(im * 255.0, padding, value=color[0]) / 255.0 + + return im + + +class MegaDetector_v5_Transform: + """ + A transformation class to preprocess images for the MegaDetector v5 model. + + This transform handles the complete preprocessing pipeline required for MegaDetector, + including image format conversion, tensor conversion, normalization, and the specific + letterbox resizing used by YOLO-based detection models. + + Based on Microsoft CameraTraps PytorchWildlife transforms. + Copyright (c) Microsoft Corporation. All rights reserved. + Licensed under the MIT License. + + Original source: https://github.com/microsoft/CameraTraps/blob/main/PytorchWildlife/data/transforms.py + + This transformation is specifically designed for YOLOv5-based MegaDetector models + and ensures proper preprocessing for optimal detection performance. + + Attributes: + target_size (int): The target size for the image's longest side after resizing. + stride (int): Stride value used for padding calculations in letterbox transform. + + Examples: + >>> from PIL import Image + >>> import numpy as np + >>> + >>> # Create transform for MegaDetector + >>> transform = MegaDetector_v5_Transform(target_size=1280, stride=32) + >>> + >>> # Transform a PIL image + >>> pil_img = Image.open("wildlife_photo.jpg") + >>> tensor_output = transform(pil_img) + >>> print(tensor_output.shape) # torch.Size([3, 1280, 1280]) + >>> + >>> # Transform a numpy array + >>> np_img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) + >>> tensor_output = transform(np_img) + >>> print(tensor_output.shape) # torch.Size([3, 1280, 1280]) + >>> + >>> # Use in a data pipeline + >>> from torchvision.datasets import ImageFolder + >>> dataset = ImageFolder(root='images/', transform=transform) + + Note: + - Handles both PIL Images and numpy arrays as input + - Always outputs a torch.Tensor with values in [0, 1] range + - Preserves aspect ratio using letterbox padding + - Optimized for MegaDetector model input requirements + """ + + def __init__(self, target_size: int = 1280, stride: int = 32): + """ + Initialize the MegaDetector v5 transform. + + Args: + target_size (int, optional): Desired size for the image's square output + dimensions. MegaDetector typically uses 1280x1280. Defaults to 1280. + stride (int, optional): Stride value for letterbox padding calculations. + Should match the model's architectural stride. Defaults to 32. + + Examples: + >>> # Standard MegaDetector transform + >>> transform = MegaDetector_v5_Transform() + >>> + >>> # Custom size for different model variants + >>> transform = MegaDetector_v5_Transform(target_size=640, stride=32) + >>> + >>> # High resolution for better accuracy + >>> transform = MegaDetector_v5_Transform(target_size=1920, stride=64) + """ + self.target_size = target_size + self.stride = stride + + def __call__(self, image: Union[np.ndarray, Image.Image]) -> torch.Tensor: + """ + Apply the transformation to the provided image. + + This method handles the complete preprocessing pipeline: + 1. Convert PIL Image to numpy array if needed + 2. 
Convert numpy array to torch tensor with proper channel ordering + 3. Normalize pixel values to [0, 1] range + 4. Apply letterbox transform for aspect-ratio preserving resize and padding + + Args: + image (np.ndarray or PIL.Image): Input image. Can be: + - PIL Image in any mode (will be converted to RGB) + - numpy array with shape (H, W, C) and dtype uint8 or float + - Values should be in range [0, 255] for uint8 or [0, 1] for float + + Returns: + torch.Tensor: Transformed image tensor with shape (C, H, W) where: + - C = 3 (RGB channels) + - H = W = target_size + - Values are in range [0, 1] + - Aspect ratio is preserved with gray padding + + Raises: + TypeError: If input image is not a PIL Image or numpy array + ValueError: If numpy array doesn't have the expected shape + + Examples: + >>> transform = MegaDetector_v5_Transform(target_size=640) + >>> + >>> # PIL Image input + >>> pil_img = Image.open("animal.jpg") # e.g., 1200x800 image + >>> result = transform(pil_img) + >>> print(result.shape, result.dtype, result.min(), result.max()) + >>> # torch.Size([3, 640, 640]) torch.float32 tensor(0.) tensor(1.) + >>> + >>> # Numpy array input + >>> np_img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) + >>> result = transform(np_img) + >>> print(result.shape) # torch.Size([3, 640, 640]) + >>> + >>> # Grayscale PIL image (automatically converted to RGB) + >>> gray_img = Image.open("grayscale.jpg").convert('L') + >>> result = transform(gray_img.convert('RGB')) + """ + # Convert PIL Image to numpy array if needed + if isinstance(image, Image.Image): + # Ensure RGB mode for consistent 3-channel output + if image.mode != 'RGB': + image = image.convert('RGB') + image = np.array(image) + elif not isinstance(image, np.ndarray): + raise TypeError(f"Expected PIL Image or numpy array, got {type(image)}") + + # Convert numpy array to PyTorch tensor with proper formatting + if isinstance(image, np.ndarray): + # Validate shape + if image.ndim != 3: + raise ValueError(f"Expected 3D array (H, W, C), got shape {image.shape}") + if image.shape[2] != 3: + raise ValueError(f"Expected 3 channels (RGB), got {image.shape[2]}") + + # Convert from HWC to CHW format + image = image.transpose((2, 0, 1)) + image = np.ascontiguousarray(image) + image = torch.from_numpy(image).float() + + # Normalize to [0, 1] range if needed + if image.max() > 1.0: + image /= 255.0 + + # Apply letterbox transform for aspect-ratio preserving resize and padding + transformed_image = letterbox( + image, + new_shape=self.target_size, + stride=self.stride, + auto=False + ) + + return transformed_image + + def __repr__(self) -> str: + """String representation of the transform.""" + return (f"{self.__class__.__name__}(" + f"target_size={self.target_size}, " + f"stride={self.stride})") diff --git a/src/hpc_inference/utils/visualization.py b/src/hpc_inference/utils/visualization.py new file mode 100644 index 0000000..1f27ebd --- /dev/null +++ b/src/hpc_inference/utils/visualization.py @@ -0,0 +1,406 @@ +""" +Visualization utilities for detection results. + +This module provides functions to visualize detection results on original images, +handling the coordinate transformation from preprocessed (letterbox) coordinates +back to original image coordinates. 
+""" +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib import colors as mcolors +import numpy as np +from PIL import Image, ImageDraw, ImageFont +from typing import List, Dict, Any, Tuple, Union, Optional +import torch + + +def reverse_letterbox_coords( + bbox: List[float], + original_shape: Tuple[int, int], + letterbox_shape: Tuple[int, int] +) -> List[float]: + """ + Convert bounding box coordinates from letterbox (preprocessed) space back to original image space. + + The letterbox transform preserves aspect ratio by: + 1. Scaling the image to fit within the target size + 2. Adding symmetric padding to make it square + + This function reverses that transformation to map coordinates back to the original image. + + Args: + bbox (List[float]): Bounding box coordinates in letterbox space [x1, y1, x2, y2] + original_shape (Tuple[int, int]): Original image dimensions (height, width) + letterbox_shape (Tuple[int, int]): Letterbox dimensions (height, width), typically (1280, 1280) + + Returns: + List[float]: Bounding box coordinates in original image space [x1, y1, x2, y2] + + Example: + >>> # Original image is 1200x800, letterbox is 1280x1280 + >>> # Detection bbox in letterbox space + >>> letterbox_bbox = [640.0, 320.0, 960.0, 640.0] + >>> original_bbox = reverse_letterbox_coords( + ... letterbox_bbox, + ... (800, 1200), # original (H, W) + ... (1280, 1280) # letterbox (H, W) + ... ) + >>> print(original_bbox) # Coordinates in original 1200x800 image + """ + x1, y1, x2, y2 = bbox + orig_h, orig_w = original_shape + letterbox_h, letterbox_w = letterbox_shape + + # Calculate the scaling ratio used in letterbox transform + # The letterbox transform uses min ratio to preserve aspect ratio + scale_ratio = min(letterbox_h / orig_h, letterbox_w / orig_w) + + # Calculate the scaled dimensions (before padding) + scaled_w = int(round(orig_w * scale_ratio)) + scaled_h = int(round(orig_h * scale_ratio)) + + # Calculate the padding added + pad_w = (letterbox_w - scaled_w) / 2 + pad_h = (letterbox_h - scaled_h) / 2 + + # Remove padding from coordinates + x1_unpadded = x1 - pad_w + y1_unpadded = y1 - pad_h + x2_unpadded = x2 - pad_w + y2_unpadded = y2 - pad_h + + # Scale back to original image size + x1_original = x1_unpadded / scale_ratio + y1_original = y1_unpadded / scale_ratio + x2_original = x2_unpadded / scale_ratio + y2_original = y2_unpadded / scale_ratio + + # Ensure coordinates are within image bounds + x1_original = max(0, min(x1_original, orig_w)) + y1_original = max(0, min(y1_original, orig_h)) + x2_original = max(0, min(x2_original, orig_w)) + y2_original = max(0, min(y2_original, orig_h)) + + return [x1_original, y1_original, x2_original, y2_original] + + +def get_class_colors() -> Dict[str, str]: + """ + Get predefined colors for different detection classes. + + Returns: + Dict[str, str]: Mapping of class names to hex color codes + """ + return { + 'animal': '#FF6B6B', # Red + 'person': '#4ECDC4', # Teal + 'vehicle': '#45B7D1', # Blue + 'unknown': '#96CEB4' # Green + } + + +def plot_detections_matplotlib( + image: Union[np.ndarray, Image.Image], + detections: List[Dict[str, Any]], + original_shape: Optional[Tuple[int, int]] = None, + letterbox_shape: Tuple[int, int] = (1280, 1280), + confidence_threshold: float = 0.0, + show_confidence: bool = True, + show_class_names: bool = True, + figsize: Tuple[int, int] = (12, 8), + title: str = "Detection Results" +) -> plt.Figure: + """ + Plot detection results on an image using matplotlib. 
+ + Args: + image (np.ndarray or PIL.Image): Original image to plot detections on + detections (List[Dict]): List of detection dictionaries from AnimalDetector.detect() + original_shape (Tuple[int, int], optional): Original image shape (height, width). + If None, inferred from image. + letterbox_shape (Tuple[int, int]): Shape used during preprocessing (height, width) + confidence_threshold (float): Only show detections above this threshold + show_confidence (bool): Whether to show confidence scores in labels + show_class_names (bool): Whether to show class names in labels + figsize (Tuple[int, int]): Figure size for matplotlib + title (str): Plot title + + Returns: + plt.Figure: Matplotlib figure with plotted detections + + Example: + >>> from PIL import Image + >>> # Load original image and detection results + >>> img = Image.open("wildlife_photo.jpg") + >>> detections = [ + ... { + ... "bbox": [640.0, 320.0, 960.0, 640.0], # letterbox coordinates + ... "confidence": 0.85, + ... "class_name": "animal" + ... } + ... ] + >>> fig = plot_detections_matplotlib(img, detections, show_confidence=True) + >>> plt.show() + """ + # Convert PIL Image to numpy array if needed + if isinstance(image, Image.Image): + image_array = np.array(image) + if original_shape is None: + original_shape = (image.height, image.width) + else: + image_array = image + if original_shape is None: + original_shape = (image.shape[0], image.shape[1]) + + # Create figure and axis + fig, ax = plt.subplots(1, 1, figsize=figsize) + ax.imshow(image_array) + ax.set_title(title, fontsize=14, fontweight='bold') + ax.axis('off') + + # Get class colors + class_colors = get_class_colors() + + # Plot each detection + detection_count = 0 + for detection in detections: + confidence = detection.get('confidence', 0.0) + + # Skip low confidence detections + if confidence < confidence_threshold: + continue + + # Get detection info + bbox_letterbox = detection['bbox'] + class_name = detection.get('class_name', 'unknown') + + # Convert coordinates back to original image space + bbox_original = reverse_letterbox_coords(bbox_letterbox, original_shape, letterbox_shape) + x1, y1, x2, y2 = bbox_original + + # Calculate box dimensions + width = x2 - x1 + height = y2 - y1 + + # Skip very small or invalid boxes + if width <= 0 or height <= 0: + continue + + # Get color for this class + color = class_colors.get(class_name, class_colors['unknown']) + + # Create rectangle patch + rect = patches.Rectangle( + (x1, y1), width, height, + linewidth=2, + edgecolor=color, + facecolor='none', + alpha=0.8 + ) + ax.add_patch(rect) + + # Create label + label_parts = [] + if show_class_names: + label_parts.append(class_name) + if show_confidence: + label_parts.append(f"{confidence:.2f}") + + if label_parts: + label = " | ".join(label_parts) + + # Add text background for better readability + ax.text( + x1, y1 - 5, label, + bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7), + fontsize=10, color='white', weight='bold' + ) + + detection_count += 1 + + # Add summary text + ax.text( + 0.02, 0.98, f"Detections found: {detection_count}", + transform=ax.transAxes, + fontsize=12, + verticalalignment='top', + bbox=dict(boxstyle="round,pad=0.3", facecolor='black', alpha=0.7), + color='white', weight='bold' + ) + + plt.tight_layout() + return fig + + +def plot_detections_pil( + image: Union[np.ndarray, Image.Image], + detections: List[Dict[str, Any]], + original_shape: Optional[Tuple[int, int]] = None, + letterbox_shape: Tuple[int, int] = (1280, 1280), + 
confidence_threshold: float = 0.0,
+    show_confidence: bool = True,
+    show_class_names: bool = True,
+    box_width: int = 3
+) -> Image.Image:
+    """
+    Plot detection results on an image using PIL (faster, no matplotlib dependency).
+
+    Args:
+        image (np.ndarray or PIL.Image): Original image to plot detections on
+        detections (List[Dict]): List of detection dictionaries from AnimalDetector.detect()
+        original_shape (Tuple[int, int], optional): Original image shape (height, width).
+            If None, inferred from image.
+        letterbox_shape (Tuple[int, int]): Shape used during preprocessing (height, width)
+        confidence_threshold (float): Only show detections above this threshold
+        show_confidence (bool): Whether to show confidence scores in labels
+        show_class_names (bool): Whether to show class names in labels
+        box_width (int): Width of bounding box lines
+
+    Returns:
+        PIL.Image: Image with detection boxes drawn
+
+    Example:
+        >>> from PIL import Image
+        >>> # Load original image and detection results
+        >>> img = Image.open("wildlife_photo.jpg")
+        >>> detections = [...]  # From AnimalDetector.detect()
+        >>> result_img = plot_detections_pil(img, detections)
+        >>> result_img.save("detection_results.jpg")
+    """
+    # Convert to PIL Image if needed (record the numpy shape before conversion,
+    # since PIL Images expose .height/.width rather than .shape)
+    if isinstance(image, np.ndarray):
+        if original_shape is None:
+            original_shape = (image.shape[0], image.shape[1])
+        image = Image.fromarray(image)
+    else:
+        if original_shape is None:
+            original_shape = (image.height, image.width)
+
+    # Create a copy to draw on
+    result_image = image.copy()
+    draw = ImageDraw.Draw(result_image)
+
+    # Try to load a TrueType font, falling back to PIL's default bitmap font
+    try:
+        font = ImageFont.truetype("arial.ttf", 16)
+    except Exception:
+        try:
+            font = ImageFont.load_default()
+        except Exception:
+            font = None
+
+    # Color mapping (RGB tuples)
+    class_colors = {
+        'animal': (255, 107, 107),   # Red
+        'person': (78, 205, 196),    # Teal
+        'vehicle': (69, 183, 209),   # Blue
+        'unknown': (150, 206, 180)   # Green
+    }
+
+    # Plot each detection
+    detection_count = 0
+    for detection in detections:
+        confidence = detection.get('confidence', 0.0)
+
+        # Skip low confidence detections
+        if confidence < confidence_threshold:
+            continue
+
+        # Get detection info
+        bbox_letterbox = detection['bbox']
+        class_name = detection.get('class_name', 'unknown')
+
+        # Convert coordinates back to original image space
+        bbox_original = reverse_letterbox_coords(bbox_letterbox, original_shape, letterbox_shape)
+        x1, y1, x2, y2 = bbox_original
+
+        # Skip very small or invalid boxes
+        if x2 - x1 <= 0 or y2 - y1 <= 0:
+            continue
+
+        # Get color for this class
+        color = class_colors.get(class_name, class_colors['unknown'])
+
+        # Draw bounding box
+        draw.rectangle([x1, y1, x2, y2], outline=color, width=box_width)
+
+        # Create label
+        label_parts = []
+        if show_class_names:
+            label_parts.append(class_name)
+        if show_confidence:
+            label_parts.append(f"{confidence:.2f}")
+
+        if label_parts:
+            label = " | ".join(label_parts)
+
+            # Calculate text size and position
+            if font:
+                bbox = draw.textbbox((0, 0), label, font=font)
+                text_width = bbox[2] - bbox[0]
+                text_height = bbox[3] - bbox[1]
+            else:
+                text_width = len(label) * 8  # Rough estimation
+                text_height = 14
+
+            # Draw text background
+            text_x = x1
+            text_y = max(0, y1 - text_height - 5)
+            draw.rectangle(
+                [text_x, text_y, text_x + text_width + 6, text_y + text_height + 4],
+                fill=color
+            )
+
+            # Draw text
+            draw.text(
+                (text_x + 3, text_y + 2), label,
+                fill='white', font=font
+            )
+
+        detection_count += 1
+
+    return result_image
+
+
+def save_detection_visualization(
+    image: 
Union[np.ndarray, Image.Image], + detections: List[Dict[str, Any]], + output_path: str, + original_shape: Optional[Tuple[int, int]] = None, + letterbox_shape: Tuple[int, int] = (1280, 1280), + confidence_threshold: float = 0.2, + use_matplotlib: bool = False, + **kwargs +) -> None: + """ + Save detection visualization to file. + + Args: + image (np.ndarray or PIL.Image): Original image + detections (List[Dict]): Detection results + output_path (str): Path to save the visualization + original_shape (Tuple[int, int], optional): Original image shape (height, width) + letterbox_shape (Tuple[int, int]): Letterbox shape used during preprocessing + confidence_threshold (float): Minimum confidence to display + use_matplotlib (bool): If True, use matplotlib for plotting (higher quality) + **kwargs: Additional arguments passed to plotting functions + + Example: + >>> save_detection_visualization( + ... image, detections, "results.jpg", + ... confidence_threshold=0.5 + ... ) + """ + if use_matplotlib: + fig = plot_detections_matplotlib( + image, detections, original_shape, letterbox_shape, + confidence_threshold, **kwargs + ) + fig.savefig(output_path, dpi=300, bbox_inches='tight') + plt.close(fig) + else: + result_image = plot_detections_pil( + image, detections, original_shape, letterbox_shape, + confidence_threshold, **kwargs + ) + result_image.save(output_path)
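
Usage note (not part of the diff above): a minimal sketch of how the visualization utilities fit together, mapping a detection from letterbox space back onto the original image. The input file name and the hand-written detection dict are placeholders in the format the docstrings describe ("bbox" in letterbox coordinates, "confidence", "class_name"); in practice the dicts would come from the detector's detect() output, and letterbox_shape must match the target_size used by MegaDetector_v5_Transform during preprocessing.

# Minimal, self-contained sketch; file names and the detection entry are placeholders.
from PIL import Image

from hpc_inference.utils import save_detection_visualization

TARGET_SIZE = 1280  # must equal the transform's target_size used at inference time

original = Image.open("wildlife_photo.jpg").convert("RGB")

# One hand-made detection in letterbox (1280x1280) coordinates.
detections = [
    {"bbox": [640.0, 320.0, 960.0, 640.0], "confidence": 0.85, "class_name": "animal"},
]

save_detection_visualization(
    original,
    detections,
    output_path="wildlife_photo_annotated.jpg",
    original_shape=(original.height, original.width),
    letterbox_shape=(TARGET_SIZE, TARGET_SIZE),
    confidence_threshold=0.2,  # hide weak boxes
)

By default this renders through plot_detections_pil; pass use_matplotlib=True for the higher-quality matplotlib rendering.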
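
A second hedged sketch: gathering the per-rank Parquet shards written by the face-detection loop into a single table. The glob pattern assumes the shards sit somewhere under the detections output directory, and the score column name is an assumption about save_detection_results' schema; check that helper for the exact layout before relying on it.

# Hedged sketch: concatenate face-detection shards and count images with faces.
# The glob path and the "max_detection_score" column name are assumptions.
import glob

import pandas as pd

shards = sorted(glob.glob(
    "/path/to/output/detections/**/face_detection_results_rank_*_*.parquet",
    recursive=True,
))
df = pd.concat((pd.read_parquet(p) for p in shards), ignore_index=True)

SCORE_COL = "max_detection_score"  # assumed column name; verify against save_detection_results
flagged = df[df[SCORE_COL] >= 0.5]
print(f"{len(flagged)} of {len(df)} images have a face at confidence >= 0.5")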