218 changes: 172 additions & 46 deletions backend/workflow_manager/endpoint_v2/source.py
@@ -66,6 +66,25 @@ class SourceConnector(BaseConnector):

READ_CHUNK_SIZE = 4194304  # 4 MiB chunk size for reading files

@classmethod
def _detect_mime_type(cls, chunk: bytes, file_name: str) -> str:
"""Detect MIME type from file chunk using Python Magic.

Args:
chunk (bytes): File chunk to analyze
file_name (str): Name of the file being processed

Returns:
str: Detected MIME type (may be unsupported)
"""
# Primary MIME type detection using Python Magic
mime_type = magic.from_buffer(chunk, mime=True)
Review comment (Contributor): @muhammad-ali-e if the MIME type is octet-stream, can we have a secondary check using magika? You can connect with @johnyrahul to see how he handled it. Why? In some cases a PDF file's MIME type is not captured properly by magic and shows up as binary; magika can be used as a secondary check in such cases alone.

Review comment (Contributor): @ritwik-g is it a necessity to have both approaches? Maybe we could just use magika alone. I remember @shuveb suggested this too once.

logger.info(
f"Detected MIME type using Python Magic: {mime_type} for file {file_name}"
)

return mime_type
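
A minimal sketch of the fallback discussed in the thread above, assuming the magika package and its Magika().identify_bytes() API (both are assumptions, not part of this PR); a caller could pass a larger head buffer than a single chunk, which would also address the concern raised later in this review that a few bytes may not be enough for magic:

import magic
from magika import Magika  # assumed secondary detector, per the review thread

_magika = Magika()

def detect_mime_with_fallback(head: bytes) -> str:
    """Detect MIME type via python-magic, with magika as a secondary check."""
    mime_type = magic.from_buffer(head, mime=True)
    if mime_type == "application/octet-stream":
        # magic could not classify the content; fall back to magika
        result = _magika.identify_bytes(head)
        mime_type = result.output.mime_type
    return mime_type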

def __init__(
self,
workflow: Workflow,
@@ -885,6 +904,117 @@ def load_file(self, input_file_path: str) -> tuple[str, BytesIO]:

return os.path.basename(input_file_path), file_stream

@classmethod
def _process_file_chunks(
Review comment (Contributor): NIT: @muhammad-ali-e provide a more suitable name; _process_file_chunks is a bit vague. Also, we aren't accepting file chunks here; rather, we chunk the file ourselves.

cls, file: UploadedFile, file_storage, destination_path: str
) -> tuple[str, str, bool]:
"""Process file chunks and detect MIME type.

Args:
file: The uploaded file to process
file_storage: File storage instance
destination_path: Path where file should be stored

Returns:
tuple: (file_hash, mime_type, success) where success indicates if processing completed
"""
file_hash = sha256()
first_iteration = True
mime_type = file.content_type

file.seek(0)

try:
for chunk in file.chunks(chunk_size=cls.READ_CHUNK_SIZE):
if first_iteration:
mime_type = cls._detect_mime_type(chunk, file.name)
Review comment (Contributor): @muhammad-ali-e do we check the MIME type for every file, or only if file.content_type is octet-stream or missing? Reason for asking: in the past I recall @jagadeeswaran-zipstack working with the magic library, and often passing a few bytes was not enough; the entire file content had to be passed to detect some MIME types properly. So I am a bit worried that relying on the MIME type from the first chunk might not work in all cases.

if not AllowedFileTypes.is_allowed(mime_type):
raise UnsupportedMimeTypeError(
Review comment (Contributor): NIT: @muhammad-ali-e in the unsupported case, wondering if we even need to raise this error or if we can simply return "", mime_type, False. Either should be fine; just wondering which might be better. Since we are only handling this specific error, the try/except seems unnecessary and we could maybe return from here. Not sure which is the better practice.

f"Unsupported MIME type '{mime_type}' for file '{file.name}'"
)
first_iteration = False

file_hash.update(chunk)
file_storage.write(path=destination_path, mode="ab", data=chunk)

return file_hash.hexdigest(), mime_type, True

except UnsupportedMimeTypeError:
return "", mime_type, False

@classmethod
def _create_file_hash_object(
cls,
file_path: str,
connection_type,
file_name: str,
file_hash: str,
is_executed: bool,
file_size: int,
mime_type: str,
) -> FileHash:
"""Create FileHash object with provided parameters."""
return FileHash(
file_path=file_path,
source_connection_type=connection_type,
file_name=file_name,
file_hash=file_hash,
is_executed=is_executed,
file_size=file_size,
mime_type=mime_type,
)

@classmethod
def _handle_unsupported_file(
cls,
file_name: str,
mime_type: str,
destination_path: str,
connection_type,
file_size: int,
workflow_log,
) -> FileHash:
"""Handle files with unsupported MIME types."""
log_message = f"Skipping file '{file_name}' to stage due to unsupported MIME type '{mime_type}'"
workflow_log.log_info(logger=logger, message=log_message)

fake_hash = f"temp-hash-{uuid.uuid4().hex}"
return cls._create_file_hash_object(
file_path=destination_path,
connection_type=connection_type,
file_name=file_name,
file_hash=fake_hash,
is_executed=True,
file_size=file_size,
mime_type=mime_type,
)

@classmethod
def _check_duplicate_file(
cls, file_hash: str, unique_file_hashes: set[str], file_name: str, workflow_log
) -> bool:
"""Check if file is duplicate and log if needed."""
if file_hash in unique_file_hashes:
log_message = f"Skipping file '{file_name}' — duplicate detected within the current request. Already staged for processing."
workflow_log.log_info(logger=logger, message=log_message)
return True
Review comment (Contributor) on lines +1034 to +1037: @muhammad-ali-e didn't we recently discuss not skipping duplicates based on file hash / content, and allowing their processing?


unique_file_hashes.add(file_hash)
return False

@classmethod
def _get_execution_status(
Review comment (Contributor): NIT: Rename to _is_execution_completed() instead.

cls, use_file_history: bool, workflow, file_hash: str
) -> bool:
"""Get execution status for file based on history."""
if not use_file_history:
return False

file_history = FileHistoryHelper.get_file_history(
workflow=workflow, cache_key=file_hash
)
return True if file_history and file_history.is_completed() else False
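
A sketch of the rename suggested above, with the trailing conditional collapsed to bool(); behavior is unchanged:

@classmethod
def _is_execution_completed(
    cls, use_file_history: bool, workflow, file_hash: str
) -> bool:
    """Return True when file history records a completed execution for this hash."""
    if not use_file_history:
        return False
    file_history = FileHistoryHelper.get_file_history(
        workflow=workflow, cache_key=file_hash
    )
    return bool(file_history and file_history.is_completed())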

@classmethod
def add_input_file_to_api_storage(
cls,
@@ -919,65 +1049,61 @@ def add_input_file_to_api_storage(
workflow_id=workflow_id, execution_id=execution_id
)
workflow: Workflow = Workflow.objects.get(id=workflow_id)
file_hashes: dict[str, FileHash] = {}
file_hashes_objects: dict[str, FileHash] = {}
unique_file_hashes: set[str] = set()
connection_type = WorkflowEndpoint.ConnectionType.API

for file in file_objs:
file_name = file.name
destination_path = os.path.join(api_storage_dir, file_name)

mime_type = file.content_type
logger.info(f"Detected MIME type: {mime_type} for file {file_name}")
if not AllowedFileTypes.is_allowed(mime_type):
log_message = f"Skipping file '{file_name}' to stage due to unsupported MIME type '{mime_type}'"
workflow_log.log_info(logger=logger, message=log_message)
# Generate a clearly marked temporary hash to avoid reading the file content
# Helps to prevent duplicate entries in file executions
fake_hash = f"temp-hash-{uuid.uuid4().hex}"
file_hash = FileHash(
file_path=destination_path,
source_connection_type=connection_type,
file_name=file_name,
file_hash=fake_hash,
is_executed=True,
file_size=file.size,
mime_type=mime_type,
)
file_hashes.update({file_name: file_hash})
continue
logger.info(
f"Detected MIME type from content type header: {file.content_type} for file {file_name}"
)

file_system = FileSystem(FileStorageType.API_EXECUTION)
file_storage = file_system.get_file_storage()
file_hash = sha256()
for chunk in file.chunks(chunk_size=cls.READ_CHUNK_SIZE):
file_hash.update(chunk)
file_storage.write(path=destination_path, mode="ab", data=chunk)
file_hash = file_hash.hexdigest()

# Skip duplicate files
if file_hash in unique_file_hashes:
log_message = f"Skipping file '{file_name}' — duplicate detected within the current request. Already staged for processing."
workflow_log.log_info(logger=logger, message=log_message)
continue
unique_file_hashes.add(file_hash)
# Process file chunks and detect MIME type
file_hash, mime_type, success = cls._process_file_chunks(
file, file_storage, destination_path
)

file_history = None
if use_file_history:
file_history = FileHistoryHelper.get_file_history(
workflow=workflow, cache_key=file_hash
)
# Handle unsupported files
if not success:
Review comment (Contributor): might make sense to rename it to something like process_chunks_success or mime_type_detected, which would be more meaningful.

file_hash_object = cls._handle_unsupported_file(
file_name,
mime_type,
destination_path,
connection_type,
file.size,
workflow_log,
)
is_executed = True if file_history and file_history.is_completed() else False
file_hash = FileHash(
file_path=destination_path,
source_connection_type=connection_type,
file_name=file_name,
file_hash=file_hash,
is_executed=is_executed,
file_size=file.size,
mime_type=mime_type,
)
file_hashes_objects.update({file_name: file_hash_object})
continue

# Check for duplicates
if cls._check_duplicate_file(
file_hash, unique_file_hashes, file_name, workflow_log
):
continue

# Get execution status
is_executed = cls._get_execution_status(use_file_history, workflow, file_hash)

# Create file hash object
file_hash_object = cls._create_file_hash_object(
destination_path,
connection_type,
file_name,
file_hash,
is_executed,
file.size,
mime_type,
)
file_hashes.update({file_name: file_hash})
return file_hashes
file_hashes_objects.update({file_name: file_hash_object})

return file_hashes_objects

@classmethod
def create_endpoint_for_workflow(