diff --git a/backend/platform_settings_v2/migrations/__init__.py b/backend/platform_settings_v2/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/platform_settings_v2/models.py b/backend/platform_settings_v2/models.py index e69de29bb2..ff2f13f858 100644 --- a/backend/platform_settings_v2/models.py +++ b/backend/platform_settings_v2/models.py @@ -0,0 +1,73 @@ +import uuid + +from adapter_processor_v2.models import AdapterInstance +from django.db import models +from utils.models.base_model import BaseModel +from utils.models.organization_mixin import ( + DefaultOrganizationManagerMixin, + DefaultOrganizationMixin, +) + + +class PlatformSettingsModelManager(DefaultOrganizationManagerMixin, models.Manager): + """Manager for PlatformSettings model.""" + + pass + + +class PlatformSettings(DefaultOrganizationMixin, BaseModel): + """Platform-level settings for an organization. + + This model stores organization-wide settings including the system LLM + adapter that will be used for platform operations like vibe extractor + prompt generation. + """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False, + db_comment="Unique identifier for the platform settings", + ) + + # System LLM for platform operations (e.g., vibe extractor, prompt generation) + system_llm_adapter = models.ForeignKey( + AdapterInstance, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="platform_system_llm", + db_comment="System LLM adapter used for platform-level AI operations like prompt generation", + ) + + objects = PlatformSettingsModelManager() + + class Meta: + verbose_name = "Platform Setting" + verbose_name_plural = "Platform Settings" + db_table = "platform_settings" + constraints = [ + models.UniqueConstraint( + fields=["organization"], + name="unique_organization_platform_settings", + ), + ] + + def __str__(self) -> str: + return f"PlatformSettings({self.organization})" + + @classmethod + def get_for_organization(cls, organization): + """Get or create platform settings for an organization. 
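+ + Illustrative usage (assuming ``org`` is an Organization instance): + + >>> ps = PlatformSettings.get_for_organization(org) + >>> ps.system_llm_adapter is None # True until a system LLM is configured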
+ + Args: + organization: Organization instance + + Returns: + PlatformSettings instance + """ + settings, created = cls.objects.get_or_create( + organization=organization, + defaults={}, + ) + return settings diff --git a/backend/platform_settings_v2/serializers.py b/backend/platform_settings_v2/serializers.py index 24bd93ec5c..3212b4d550 100644 --- a/backend/platform_settings_v2/serializers.py +++ b/backend/platform_settings_v2/serializers.py @@ -1,7 +1,16 @@ from account_v2.models import PlatformKey +from adapter_processor_v2.models import AdapterInstance from rest_framework import serializers +from rest_framework.exceptions import ValidationError from backend.serializers import AuditSerializer +from platform_settings_v2.models import PlatformSettings +from unstract.flags.feature_flag import check_feature_flag_status + +if check_feature_flag_status("sdk1"): + from unstract.sdk1.adapters.enums import AdapterTypes +else: + from unstract.sdk.adapters.enums import AdapterTypes class PlatformKeySerializer(AuditSerializer): @@ -22,3 +31,50 @@ class PlatformKeyIDSerializer(serializers.Serializer): key_name = serializers.CharField() key = serializers.CharField() is_active = serializers.BooleanField() + + +class PlatformSettingsSerializer(AuditSerializer): + """Serializer for PlatformSettings model.""" + + system_llm_adapter = serializers.PrimaryKeyRelatedField( + queryset=AdapterInstance.objects.all(), + required=False, + allow_null=True, + ) + + class Meta: + model = PlatformSettings + fields = [ + "id", + "organization", + "system_llm_adapter", + "created_at", + "modified_at", + ] + read_only_fields = ["id", "organization", "created_at", "modified_at"] + + def validate_system_llm_adapter(self, value): + """Validate that the adapter type is LLM and is accessible to the user.""" + if value is None: + return value + + # Check if user has access to this adapter + request = self.context.get("request") + if request and hasattr(request, "user"): + try: + adapter = AdapterInstance.objects.for_user(request.user).get(id=value.id) + # Validate that the adapter type is LLM + if adapter.adapter_type != AdapterTypes.LLM.value: + raise ValidationError("Only LLM adapters are allowed for system LLM") + + # Validate that adapter is usable and active + if not adapter.is_usable: + raise ValidationError("Selected LLM adapter is not usable") + + if not adapter.is_active: + raise ValidationError("Selected LLM adapter is not active") + + except AdapterInstance.DoesNotExist: + raise ValidationError("Selected LLM adapter not found or not accessible") + + return value diff --git a/backend/platform_settings_v2/urls.py b/backend/platform_settings_v2/urls.py index feb1f5cc1e..ccad3f935c 100644 --- a/backend/platform_settings_v2/urls.py +++ b/backend/platform_settings_v2/urls.py @@ -1,7 +1,7 @@ from django.urls import path from rest_framework.urlpatterns import format_suffix_patterns -from .views import PlatformKeyViewSet +from .views import PlatformKeyViewSet, PlatformSettingsViewSet platform_key_list = PlatformKeyViewSet.as_view( {"post": "create", "put": "refresh", "get": "list"} @@ -10,6 +10,12 @@ {"put": "toggle_platform_key", "delete": "destroy"} ) +platform_settings_view = PlatformSettingsViewSet.as_view( + {"get": "list", "put": "update", "patch": "update"} +) + +platform_settings_system_llm = PlatformSettingsViewSet.as_view({"get": "system_llm"}) + urlpatterns = format_suffix_patterns( [ path( @@ -22,5 +28,15 @@ platform_key_update, name="update_platform_key", ), + path( + "settings/", + platform_settings_view, + 
name="platform_settings", + ), + path( + "settings/system-llm/", + platform_settings_system_llm, + name="platform_settings_system_llm", + ), ] ) diff --git a/backend/platform_settings_v2/views.py b/backend/platform_settings_v2/views.py index 41610b60c5..1674f8e51c 100644 --- a/backend/platform_settings_v2/views.py +++ b/backend/platform_settings_v2/views.py @@ -5,17 +5,20 @@ from account_v2.models import Organization, PlatformKey from rest_framework import status, viewsets +from rest_framework.decorators import action from rest_framework.request import Request from rest_framework.response import Response from utils.user_context import UserContext from platform_settings_v2.constants import PlatformServiceConstants +from platform_settings_v2.models import PlatformSettings from platform_settings_v2.platform_auth_helper import PlatformAuthHelper from platform_settings_v2.platform_auth_service import PlatformAuthenticationService from platform_settings_v2.serializers import ( PlatformKeyGenerateSerializer, PlatformKeyIDSerializer, PlatformKeySerializer, + PlatformSettingsSerializer, ) logger = logging.getLogger(__name__) @@ -123,3 +126,79 @@ def create(self, request: Request) -> Response: status=status.HTTP_201_CREATED, data=serialized_data, ) + + +class PlatformSettingsViewSet(viewsets.ModelViewSet): + """ViewSet for managing platform settings.""" + + serializer_class = PlatformSettingsSerializer + + def get_queryset(self): + """Get platform settings for the user's organization.""" + organization = UserContext.get_organization() + return PlatformSettings.objects.filter(organization=organization) + + def get_object(self): + """Get or create platform settings for the user's organization.""" + organization = UserContext.get_organization() + settings, created = PlatformSettings.objects.get_or_create( + organization=organization + ) + return settings + + def list( + self, request: Request, *args: tuple[Any], **kwargs: dict[str, Any] + ) -> Response: + """List platform settings for the organization.""" + settings = self.get_object() + serializer = self.get_serializer(settings) + return Response(serializer.data) + + def retrieve( + self, request: Request, *args: tuple[Any], **kwargs: dict[str, Any] + ) -> Response: + """Retrieve platform settings for the organization.""" + settings = self.get_object() + serializer = self.get_serializer(settings) + return Response(serializer.data) + + def update( + self, request: Request, *args: tuple[Any], **kwargs: dict[str, Any] + ) -> Response: + """Update platform settings.""" + settings = self.get_object() + serializer = self.get_serializer( + settings, data=request.data, partial=True, context={"request": request} + ) + serializer.is_valid(raise_exception=True) + serializer.save() + return Response(serializer.data) + + @action(detail=False, methods=["get"]) + def system_llm(self, request: Request) -> Response: + """Get the configured system LLM adapter for the organization. 
+ + Returns: + Response with system LLM adapter details or null if not configured + """ + settings = self.get_object() + if settings.system_llm_adapter: + from adapter_processor_v2.serializers import AdapterInstanceSerializer + + adapter_serializer = AdapterInstanceSerializer(settings.system_llm_adapter) + return Response( + { + "system_llm_adapter": adapter_serializer.data, + "is_configured": True, + }, + status=status.HTTP_200_OK, + ) + else: + return Response( + { + "system_llm_adapter": None, + "is_configured": False, + "message": "No system LLM adapter configured for this organization", + }, + status=status.HTTP_200_OK, + ) diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/__init__.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/admin.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/admin.py new file mode 100644 index 0000000000..b083210e1c --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/admin.py @@ -0,0 +1,30 @@ +from django.contrib import admin + +from prompt_studio.prompt_studio_vibe_extractor_v2.models import ( + VibeExtractorProject, +) + + +@admin.register(VibeExtractorProject) +class VibeExtractorProjectAdmin(admin.ModelAdmin): + """Admin interface for VibeExtractorProject.""" + + list_display = [ + "project_id", + "document_type", + "status", + "tool_id", + "created_at", + "modified_at", + ] + list_filter = ["status", "created_at"] + search_fields = ["document_type", "project_id"] + readonly_fields = [ + "project_id", + "generation_output_path", + "generation_progress", + "created_by", + "modified_by", + "created_at", + "modified_at", + ] diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/apps.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/apps.py new file mode 100644 index 0000000000..dc03bc3a2a --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class PromptStudioVibeExtractorV2Config(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "prompt_studio.prompt_studio_vibe_extractor_v2" + verbose_name = "Prompt Studio Vibe Extractor V2" diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/constants.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/constants.py new file mode 100644 index 0000000000..1fd5b19fee --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/constants.py @@ -0,0 +1,44 @@ +"""Constants for Vibe Extractor.""" + + +class VibeExtractorKeys: + """Keys for Vibe Extractor API requests and responses.""" + + PROJECT_ID = "project_id" + DOCUMENT_TYPE = "document_type" + STATUS = "status" + GENERATION_OUTPUT_PATH = "generation_output_path" + ERROR_MESSAGE = "error_message" + GENERATION_PROGRESS = "generation_progress" + TOOL_ID = "tool_id" + + +class VibeExtractorFileNames: + """File names for generated files.""" + + METADATA_YAML = "metadata.yaml" + EXTRACTION_YAML = "extraction.yaml" + PAGE_EXTRACTION_SYSTEM_MD = "page-extraction-system.md" + PAGE_EXTRACTION_USER_MD = "page-extraction-user.md" + SCALARS_EXTRACTION_SYSTEM_MD = "extraction-scalars-system.md" + SCALARS_EXTRACTION_USER_MD = "extraction-scalars-user.md" + TABLES_EXTRACTION_SYSTEM_MD = "extraction-table-system.md" + TABLES_EXTRACTION_USER_MD = "extraction-table-user.md" + + +class VibeExtractorPaths: + """Path constants for Vibe 
Extractor.""" + + PROMPTS_DIR = "prompts" + STAGING_DIR = "staging" + REFERENCE_DIR = "reference" + + +class GenerationSteps: + """Steps in the generation process.""" + + METADATA = "metadata" + EXTRACTION_FIELDS = "extraction_fields" + PAGE_EXTRACTION_PROMPTS = "page_extraction_prompts" + SCALARS_EXTRACTION_PROMPTS = "scalars_extraction_prompts" + TABLES_EXTRACTION_PROMPTS = "tables_extraction_prompts" diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/exceptions.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/exceptions.py new file mode 100644 index 0000000000..f521afc1fd --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/exceptions.py @@ -0,0 +1,31 @@ +"""Exceptions for Vibe Extractor.""" + + +class VibeExtractorError(Exception): + """Base exception for Vibe Extractor errors.""" + + pass + + +class ProjectNotFoundError(VibeExtractorError): + """Raised when a project is not found.""" + + pass + + +class GenerationError(VibeExtractorError): + """Raised when generation fails.""" + + pass + + +class FileReadError(VibeExtractorError): + """Raised when reading a generated file fails.""" + + pass + + +class InvalidDocumentTypeError(VibeExtractorError): + """Raised when document type is invalid.""" + + pass diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/migrations/__init__.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/models.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/models.py new file mode 100644 index 0000000000..452e065bb4 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/models.py @@ -0,0 +1,65 @@ +import uuid + +from account_v2.models import User +from adapter_processor_v2.models import AdapterInstance +from django.db import models +from utils.models.base_model import BaseModel + +from prompt_studio.prompt_studio_core_v2.models import CustomTool + + +class VibeExtractorProject(BaseModel): + """Model to store Vibe Extractor project metadata. + + This stores the document type and tracks the generation process. + All generated content (metadata.yaml, extraction.yaml, prompts) + will be stored as files in the repository. 
+ """ + + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + document_type = models.TextField( + blank=False, + db_comment="Document type name (e.g., invoice, receipt)", + ) + llm_adapter = models.ForeignKey( + AdapterInstance, + on_delete=models.SET_NULL, + related_name="vibe_extractor_projects_llm", + null=True, + blank=True, + db_comment="LLM adapter used for generation (from platform system LLM)", + ) + tool_id = models.ForeignKey( + CustomTool, + on_delete=models.SET_NULL, + related_name="vibe_extractor_projects", + null=True, + blank=True, + db_comment="Associated custom tool", + ) + created_by = models.ForeignKey( + User, + on_delete=models.SET_NULL, + related_name="vibe_extractor_projects_created", + null=True, + blank=True, + editable=False, + ) + modified_by = models.ForeignKey( + User, + on_delete=models.SET_NULL, + related_name="vibe_extractor_projects_modified", + null=True, + blank=True, + editable=False, + ) + + class Meta: + verbose_name = "Vibe Extractor Project" + verbose_name_plural = "Vibe Extractor Projects" + db_table = "vibe_extractor_project" + indexes = [ + models.Index(fields=["document_type"]), + models.Index(fields=["status"]), + models.Index(fields=["tool_id"]), + ] diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/serializers.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/serializers.py new file mode 100644 index 0000000000..548b724e3b --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/serializers.py @@ -0,0 +1,137 @@ +from rest_framework import serializers + +from prompt_studio.prompt_studio_vibe_extractor_v2.models import ( + VibeExtractorProject, +) + + +class VibeExtractorProjectSerializer(serializers.ModelSerializer): + """Serializer for VibeExtractorProject model.""" + + class Meta: + model = VibeExtractorProject + fields = [ + "id", + "document_type", + "llm_adapter", + "tool_id", + "created_by", + "modified_by", + "created_at", + "modified_at", + ] + read_only_fields = [ + "id", + "created_by", + "modified_by", + "created_at", + "modified_at", + ] + + +class VibeExtractorProjectCreateSerializer(serializers.Serializer): + """Serializer for creating a new Vibe Extractor project.""" + + document_type = serializers.CharField( + required=True, + help_text="Document type name (e.g., invoice, receipt)", + ) + tool_id = serializers.UUIDField( + required=False, + allow_null=True, + help_text="Associated custom tool ID", + ) + + +class VibeExtractorGenerateSerializer(serializers.Serializer): + """Serializer for triggering generation for a project.""" + + regenerate = serializers.BooleanField( + default=False, + help_text="Whether to regenerate if files already exist", + ) + + +class VibeExtractorFileReadSerializer(serializers.Serializer): + """Serializer for reading generated files.""" + + file_type = serializers.ChoiceField( + choices=[ + "metadata", + "extraction", + "page_extraction_system", + "page_extraction_user", + "scalars_extraction_system", + "scalars_extraction_user", + "tables_extraction_system", + "tables_extraction_user", + ], + required=True, + help_text="Type of file to read", + ) + + +class VibeExtractorGenerateMetadataSerializer(serializers.Serializer): + """Serializer for generating metadata only.""" + + regenerate = serializers.BooleanField( + default=False, + help_text="Whether to regenerate if metadata already exists", + ) + + +class VibeExtractorGenerateExtractionFieldsSerializer(serializers.Serializer): + """Serializer for generating extraction fields.""" + + 
metadata = serializers.JSONField( + required=True, + help_text="Metadata dictionary to use for generation", + ) + + +class VibeExtractorGeneratePagePromptsSerializer(serializers.Serializer): + """Serializer for generating page extraction prompts.""" + + metadata = serializers.JSONField( + required=True, + help_text="Metadata dictionary to use for generation", + ) + + +class VibeExtractorGenerateScalarPromptsSerializer(serializers.Serializer): + """Serializer for generating scalar extraction prompts.""" + + metadata = serializers.JSONField( + required=True, + help_text="Metadata dictionary to use for generation", + ) + extraction_yaml = serializers.CharField( + required=True, + help_text="Extraction YAML content", + ) + + +class VibeExtractorGenerateTablePromptsSerializer(serializers.Serializer): + """Serializer for generating table extraction prompts.""" + + metadata = serializers.JSONField( + required=True, + help_text="Metadata dictionary to use for generation", + ) + extraction_yaml = serializers.CharField( + required=True, + help_text="Extraction YAML content", + ) + + +class VibeExtractorGuessDocumentTypeSerializer(serializers.Serializer): + """Serializer for guessing document type from file.""" + + file_name = serializers.CharField( + required=True, + help_text="Name of the file in permanent storage", + ) + tool_id = serializers.UUIDField( + required=True, + help_text="Tool ID to construct the file path", + ) diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/__init__.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/__init__.py new file mode 100644 index 0000000000..662d45ff25 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/__init__.py @@ -0,0 +1 @@ +"""Services for Vibe Extractor V2.""" diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/adapter_helper.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/adapter_helper.py new file mode 100644 index 0000000000..f9acb7a68b --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/adapter_helper.py @@ -0,0 +1,180 @@ +"""Adapter Helper for Vibe Extractor. + +This module converts a platform AdapterInstance to an autogen-compatible LLM configuration. +""" + +import logging +from typing import Any + +from adapter_processor_v2.models import AdapterInstance + +logger = logging.getLogger(__name__) + + +class AdapterHelper: + """Helper to convert AdapterInstance to LLM configuration.""" + + # Mapping of adapter_id to autogen adapter_id. Ordered most-specific + # first because get_autogen_adapter_id falls back to substring matching + # ("azure-openai" must not be matched by the plain "openai" key). + ADAPTER_ID_MAPPING = { + # Azure OpenAI adapters + "azure-openai": "azureopenai", + "azureopenai": "azureopenai", + # OpenAI adapters + "openai": "openai", + "openai-llm": "openai", + # Anthropic adapters + "anthropic": "anthropic", + "claude": "anthropic", + # Bedrock adapters + "bedrock": "bedrock", + "aws-bedrock": "bedrock", + } + + @staticmethod + def get_autogen_adapter_id(adapter_id: str) -> str: + """Get autogen-compatible adapter ID. + + Args: + adapter_id: Platform adapter ID + + Returns: + Autogen adapter ID (openai, azureopenai, anthropic, bedrock) + """ + # Normalize adapter_id + normalized_id = adapter_id.lower().strip() + + # Exact match first + if normalized_id in AdapterHelper.ADAPTER_ID_MAPPING: + return AdapterHelper.ADAPTER_ID_MAPPING[normalized_id] + + # Fall back to substring matching (most-specific keys first) + for key, value in AdapterHelper.ADAPTER_ID_MAPPING.items(): + if key in normalized_id: + return value + + # Default to openai if not found + logger.warning(f"Unknown adapter_id: {adapter_id}. 
Defaulting to 'openai'") + return "openai" + + @staticmethod + def convert_to_llm_config(adapter: AdapterInstance) -> dict[str, Any]: + """Convert AdapterInstance to autogen LLM configuration. + + Args: + adapter: AdapterInstance from platform + + Returns: + LLM configuration dictionary for autogen + + Raises: + ValueError: If adapter type is not LLM + """ + # Validate adapter type + if adapter.adapter_type != "LLM": + raise ValueError(f"Adapter must be of type LLM, got: {adapter.adapter_type}") + + # Get decrypted metadata + metadata = adapter.metadata + + # Get autogen adapter ID + autogen_adapter_id = AdapterHelper.get_autogen_adapter_id(adapter.adapter_id) + + # Base configuration + llm_config = { + "adapter_id": autogen_adapter_id, + "model": metadata.get("model", metadata.get("deployment", "gpt-4")), + "temperature": float(metadata.get("temperature", 0.7)), + "max_tokens": int(metadata.get("max_tokens", 4096)), + } + + # Provider-specific configuration + if autogen_adapter_id == "openai": + llm_config["api_key"] = metadata.get("api_key", "") + if "api_base" in metadata: + llm_config["api_base"] = metadata["api_base"] + if "timeout" in metadata: + llm_config["timeout"] = int(metadata["timeout"]) + if "max_retries" in metadata: + llm_config["max_retries"] = int(metadata["max_retries"]) + + elif autogen_adapter_id == "azureopenai": + llm_config["api_key"] = metadata.get("api_key", "") + llm_config["api_base"] = metadata.get( + "azure_endpoint", metadata.get("api_base", "") + ) + llm_config["api_version"] = metadata.get("api_version", "2024-02-15-preview") + llm_config["deployment"] = metadata.get("deployment", metadata.get("model")) + if "timeout" in metadata: + llm_config["timeout"] = int(metadata["timeout"]) + + elif autogen_adapter_id == "anthropic": + llm_config["api_key"] = metadata.get("api_key", "") + if "api_base" in metadata: + llm_config["api_base"] = metadata["api_base"] + + elif autogen_adapter_id == "bedrock": + llm_config["aws_access_key_id"] = metadata.get("aws_access_key_id", "") + llm_config["aws_secret_access_key"] = metadata.get( + "aws_secret_access_key", "" + ) + llm_config["region_name"] = metadata.get("region_name", "us-east-1") + if "max_retries" in metadata: + llm_config["max_retries"] = int(metadata["max_retries"]) + if "budget_tokens" in metadata: + llm_config["budget_tokens"] = int(metadata["budget_tokens"]) + if "timeout" in metadata: + llm_config["timeout"] = int(metadata["timeout"]) + + # Add provider for tracking + llm_config["provider"] = adapter.adapter_id + + return llm_config + + @staticmethod + def validate_llm_adapter(adapter: AdapterInstance) -> tuple[bool, str]: + """Validate that adapter is suitable for vibe extraction. + + Args: + adapter: AdapterInstance to validate + + Returns: + Tuple of (is_valid, error_message) + """ + # Check adapter type + if adapter.adapter_type != "LLM": + return False, f"Adapter must be of type LLM, got: {adapter.adapter_type}" + + # Check if adapter is usable + if not adapter.is_usable: + return False, "Adapter is not usable" + + # Check if adapter is active + if not adapter.is_active: + return ( + False, + "Adapter is not active. 
Please activate it in platform settings.", + ) + + # Try to get metadata + try: + metadata = adapter.metadata + if not metadata: + return False, "Adapter metadata is empty" + except Exception as e: + return False, f"Error reading adapter metadata: {str(e)}" + + # Check for required fields + required_fields = ["model"] + autogen_adapter_id = AdapterHelper.get_autogen_adapter_id(adapter.adapter_id) + + if autogen_adapter_id in ["openai", "azureopenai", "anthropic"]: + required_fields.append("api_key") + elif autogen_adapter_id == "bedrock": + required_fields.extend( + ["aws_access_key_id", "aws_secret_access_key", "region_name"] + ) + + missing_fields = [field for field in required_fields if not metadata.get(field)] + if missing_fields: + return ( + False, + f"Missing required fields in adapter metadata: {', '.join(missing_fields)}", + ) + + return True, "" diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/generator_service.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/generator_service.py new file mode 100644 index 0000000000..b0751db2b1 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/generator_service.py @@ -0,0 +1,275 @@ +"""Generator Service Integration. + +This module integrates with the prompt service to generate +document extraction components. +""" + +import asyncio +import logging +from typing import Any + +from adapter_processor_v2.models import AdapterInstance +from platform_settings_v2.models import PlatformSettings +from utils.user_context import UserContext + +from prompt_studio.prompt_studio_vibe_extractor_v2.models import ( + VibeExtractorProject, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.services.adapter_helper import ( + AdapterHelper, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.vibe_extractor_helper import ( + VibeExtractorHelper, +) + +logger = logging.getLogger(__name__) + + +class GeneratorService: + """Service to integrate with prompt service for generation.""" + + @staticmethod + def _get_system_llm_adapter() -> AdapterInstance: + """Get system LLM adapter from platform settings. + + Returns: + AdapterInstance configured as system LLM + + Raises: + ValueError: If system LLM is not configured + """ + try: + organization = UserContext.get_organization() + platform_settings = PlatformSettings.get_for_organization(organization) + + if not platform_settings.system_llm_adapter: + raise ValueError( + "No system LLM adapter configured for this organization. " + "Please configure a system LLM in platform settings." + ) + + # Validate the adapter + is_valid, error_msg = AdapterHelper.validate_llm_adapter( + platform_settings.system_llm_adapter + ) + if not is_valid: + raise ValueError(f"System LLM adapter is invalid: {error_msg}") + + return platform_settings.system_llm_adapter + + except Exception as e: + logger.error("Failed to get system LLM adapter: %s", str(e)) + raise ValueError(f"Failed to get system LLM adapter: {str(e)}") from e + + @staticmethod + def _get_llm_config( + project: VibeExtractorProject = None, + ) -> dict[str, Any]: + """Get LLM configuration from platform system LLM or project. 
+ + Args: + project: Optional VibeExtractorProject to get LLM from + + Returns: + LLM configuration dictionary + + Raises: + ValueError: If LLM configuration is missing or invalid + """ + # If project has an LLM adapter, use it + if project and project.llm_adapter: + adapter = project.llm_adapter + else: + # Otherwise, get system LLM from platform settings + adapter = GeneratorService._get_system_llm_adapter() + + # Convert adapter to LLM config + try: + llm_config = AdapterHelper.convert_to_llm_config(adapter) + logger.info( + "Using LLM adapter: %s (model: %s)", + adapter.adapter_name, + llm_config.get("model"), + ) + return llm_config + except Exception as e: + error_msg = f"Failed to convert adapter to LLM config: {str(e)}" + logger.error(error_msg) + raise ValueError(error_msg) from e + + @staticmethod + def _get_reference_template() -> str: + """Get reference metadata.yaml template. + + Returns: + Reference template content + """ + try: + reference_template = VibeExtractorHelper.get_reference_template( + "metadata.yaml" + ) + return reference_template + except Exception as e: + logger.warning(f"Could not load reference template: {e}") + # Return default template + return """--- +name_identifier: example +name: Example Document +description: | + Example document description. +description_seo: | + SEO optimized description. +html_meta_description: | + HTML meta description. +tags: + - example +version: 1.0.0 +status: beta +visibility: public +author: Zipstack Inc +release_date: 2025-07-01 +price_multiplier: 1.0 +llm_model: claude-sonnet-1-7 +extraction_features: + locate_pages: true + rolling_window: false + challenge: false +""" + + @staticmethod + def _create_progress_callback(project: VibeExtractorProject): + """Create a progress callback for updating project status. + + Args: + project: VibeExtractorProject instance + + Returns: + Callback function + """ + + def progress_callback(step: str, status: str, message: str = ""): + """Update project progress. + + Args: + step: Generation step name + status: Status (in_progress, completed, failed) + message: Optional message + """ + try: + VibeExtractorHelper.update_generation_progress( + project, step, status, message + ) + + # Update project status based on step + if status == "failed": + project.status = VibeExtractorProject.Status.FAILED + project.error_message = message + project.save(update_fields=["status", "error_message", "modified_at"]) + elif step == "generating_metadata": + project.status = VibeExtractorProject.Status.GENERATING_METADATA + project.save(update_fields=["status", "modified_at"]) + elif step == "generating_extraction_fields": + project.status = VibeExtractorProject.Status.GENERATING_FIELDS + project.save(update_fields=["status", "modified_at"]) + elif step == "generating_page_prompts" or step.startswith("generating_"): + project.status = VibeExtractorProject.Status.GENERATING_PROMPTS + project.save(update_fields=["status", "modified_at"]) + + except Exception as e: + logger.error(f"Error in progress callback: {e}") + + return progress_callback + + @staticmethod + async def generate_all_async( + project: VibeExtractorProject, + ) -> dict[str, Any]: + """Generate all components for a project asynchronously. 
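+ + Resolves the LLM configuration (the project's adapter, falling back to + the platform system LLM), wires up a progress callback, delegates the + actual generation to the prompt service and persists the final status + on the project.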
+ + Args: + project: VibeExtractorProject instance + + Returns: + Dictionary containing generation result + """ + try: + # Import here to avoid circular imports and ensure prompt service is available + from unstract.prompt_service.services.vibe_extractor.api_helper import ( + generate_document_extraction_components_sync, + ) + + # Get system LLM adapter if not already set on project + if not project.llm_adapter: + system_llm = GeneratorService._get_system_llm_adapter() + project.llm_adapter = system_llm + project.save(update_fields=["llm_adapter"]) + + # Get LLM configuration + llm_config = GeneratorService._get_llm_config(project) + + # Get reference template + reference_template = GeneratorService._get_reference_template() + + # Get output directory + output_dir = VibeExtractorHelper.get_project_output_path(project) + + # Create progress callback + progress_callback = GeneratorService._create_progress_callback(project) + + # Generate all components + result = generate_document_extraction_components_sync( + doc_type=project.document_type, + output_dir=str(output_dir.parent), + llm_config=llm_config, + reference_template=reference_template, + progress_callback=progress_callback, + ) + + # Update project status based on result + if result["status"] == "success": + project.status = VibeExtractorProject.Status.COMPLETED + project.generation_output_path = result["output_path"] + project.error_message = "" + project.save( + update_fields=[ + "status", + "generation_output_path", + "error_message", + "modified_at", + ] + ) + else: + project.status = VibeExtractorProject.Status.FAILED + project.error_message = result.get("error", "Unknown error") + project.save(update_fields=["status", "error_message", "modified_at"]) + + return result + + except Exception as e: + error_msg = f"Error during generation: {str(e)}" + logger.error(error_msg, exc_info=True) + + project.status = VibeExtractorProject.Status.FAILED + project.error_message = error_msg + project.save(update_fields=["status", "error_message", "modified_at"]) + + return {"status": "error", "error": error_msg} + + @staticmethod + def generate_all(project: VibeExtractorProject) -> dict[str, Any]: + """Generate all components for a project (sync wrapper). + + Args: + project: VibeExtractorProject instance + + Returns: + Dictionary containing generation result + """ + # Run the async function in a new event loop + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + return loop.run_until_complete(GeneratorService.generate_all_async(project)) diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/prompt_service_helper.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/prompt_service_helper.py new file mode 100644 index 0000000000..89fa7d2a74 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/services/prompt_service_helper.py @@ -0,0 +1,234 @@ +"""Helper to communicate with prompt-service for vibe extractor operations. + +This module provides a helper class that uses the SDK's PromptTool +to communicate with the prompt-service, following Unstract's standards. 
+""" + +import logging +from typing import Any + +from account_v2.constants import Common +from django.conf import settings +from utils.local_context import StateStore + +from prompt_studio.prompt_studio_core_v2.prompt_ide_base_tool import ( + PromptIdeBaseTool, +) +from unstract.flags.feature_flag import check_feature_flag_status + +if check_feature_flag_status("sdk1"): + from unstract.sdk1.constants import LogLevel + from unstract.sdk1.prompt import PromptTool +else: + from unstract.sdk.constants import LogLevel + from unstract.sdk.prompt import PromptTool + +logger = logging.getLogger(__name__) + + +class VibeExtractorPromptServiceHelper: + """Helper class to communicate with prompt-service for vibe extractor. + + This class follows Unstract's standard pattern of using PromptIdeBaseTool + with the SDK's PromptTool to make HTTP calls to the prompt-service. + """ + + @staticmethod + def _get_prompt_tool(org_id: str) -> PromptTool: + """Get configured PromptTool instance. + + Args: + org_id: Organization ID + + Returns: + Configured PromptTool instance + """ + # Create PromptIdeBaseTool (standard tool used in backend) + util = PromptIdeBaseTool(log_level=LogLevel.INFO, org_id=org_id) + + # Create PromptTool instance + prompt_tool = PromptTool( + tool=util, + prompt_host=settings.PROMPT_HOST, + prompt_port=settings.PROMPT_PORT, + request_id=StateStore.get(Common.REQUEST_ID), + ) + + return prompt_tool + + @staticmethod + def guess_document_type( + file_content: str, + llm_config: dict[str, Any], + org_id: str, + ) -> dict[str, Any]: + """Guess document type from file content. + + Args: + file_content: Extracted text content from document + llm_config: LLM configuration dictionary + org_id: Organization ID + + Returns: + Dictionary with status, document_type, confidence, etc. + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "file_content": file_content, + "llm_config": llm_config, + } + + return prompt_tool.guess_document_type(payload=payload) + + @staticmethod + def generate_metadata( + doc_type: str, + llm_config: dict[str, Any], + reference_template: str, + org_id: str, + ) -> dict[str, Any]: + """Generate metadata for a document type. + + Args: + doc_type: Document type name + llm_config: LLM configuration dictionary + reference_template: Reference metadata template + org_id: Organization ID + + Returns: + Dictionary with status and metadata + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "doc_type": doc_type, + "llm_config": llm_config, + "reference_template": reference_template, + } + + return prompt_tool.generate_metadata(payload=payload) + + @staticmethod + def generate_extraction_fields( + doc_type: str, + metadata_description: str, + llm_config: dict[str, Any], + org_id: str, + ) -> dict[str, Any]: + """Generate extraction fields YAML. 
+ + Args: + doc_type: Document type name + metadata_description: Description from metadata + llm_config: LLM configuration dictionary + org_id: Organization ID + + Returns: + Dictionary with status and extraction_yaml + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "doc_type": doc_type, + "metadata_description": metadata_description, + "llm_config": llm_config, + } + + return prompt_tool.generate_extraction_fields(payload=payload) + + @staticmethod + def generate_page_prompts( + doc_type: str, + metadata_description: str, + llm_config: dict[str, Any], + org_id: str, + ) -> dict[str, Any]: + """Generate page extraction prompts. + + Args: + doc_type: Document type name + metadata_description: Description from metadata + llm_config: LLM configuration dictionary + org_id: Organization ID + + Returns: + Dictionary with status, system_prompt, user_prompt + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "doc_type": doc_type, + "metadata_description": metadata_description, + "llm_config": llm_config, + } + + return prompt_tool.generate_page_prompts(payload=payload) + + @staticmethod + def generate_scalar_prompts( + doc_type: str, + metadata_description: str, + extraction_yaml: str, + scalar_fields: list, + llm_config: dict[str, Any], + org_id: str, + ) -> dict[str, Any]: + """Generate scalar extraction prompts. + + Args: + doc_type: Document type name + metadata_description: Description from metadata + extraction_yaml: Extraction YAML string + scalar_fields: List of scalar field names + llm_config: LLM configuration dictionary + org_id: Organization ID + + Returns: + Dictionary with status, system_prompt, user_prompt + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "doc_type": doc_type, + "metadata_description": metadata_description, + "extraction_yaml": extraction_yaml, + "scalar_fields": scalar_fields, + "llm_config": llm_config, + } + + return prompt_tool.generate_scalar_prompts(payload=payload) + + @staticmethod + def generate_table_prompts( + doc_type: str, + metadata_description: str, + extraction_yaml: str, + list_fields: list, + llm_config: dict[str, Any], + org_id: str, + ) -> dict[str, Any]: + """Generate table extraction prompts. 
+ + Args: + doc_type: Document type name + metadata_description: Description from metadata + extraction_yaml: Extraction YAML string + list_fields: List of list/table field names + llm_config: LLM configuration dictionary + org_id: Organization ID + + Returns: + Dictionary with status, system_prompt, user_prompt + """ + prompt_tool = VibeExtractorPromptServiceHelper._get_prompt_tool(org_id) + + payload = { + "doc_type": doc_type, + "metadata_description": metadata_description, + "extraction_yaml": extraction_yaml, + "list_fields": list_fields, + "llm_config": llm_config, + } + + return prompt_tool.generate_table_prompts(payload=payload) diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/urls.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/urls.py new file mode 100644 index 0000000000..338d083c8a --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/urls.py @@ -0,0 +1,77 @@ +from django.urls import path +from rest_framework.routers import SimpleRouter + +from prompt_studio.prompt_studio_vibe_extractor_v2.views import ( + VibeExtractorProjectView, +) + +# Create router for standard CRUD operations +router = SimpleRouter() +router.register( + r"vibe-extractor", + VibeExtractorProjectView, + basename="vibe-extractor", +) + +# Get viewset instance for custom actions +viewset = VibeExtractorProjectView.as_view + +# Explicit URL patterns for generation endpoints; detail routes take the +# project primary key as <uuid:pk> +generation_patterns = [ + # Generate all components at once + path( + "vibe-extractor/<uuid:pk>/generate/", + viewset({"post": "generate"}), + name="vibe-extractor-generate", + ), + # Generate metadata only + path( + "vibe-extractor/<uuid:pk>/generate-metadata/", + viewset({"post": "generate_metadata"}), + name="vibe-extractor-generate-metadata", + ), + # Generate extraction fields + path( + "vibe-extractor/<uuid:pk>/generate-extraction-fields/", + viewset({"post": "generate_extraction_fields"}), + name="vibe-extractor-generate-extraction-fields", + ), + # Generate page extraction prompts + path( + "vibe-extractor/<uuid:pk>/generate-page-prompts/", + viewset({"post": "generate_page_prompts"}), + name="vibe-extractor-generate-page-prompts", + ), + # Generate scalar extraction prompts + path( + "vibe-extractor/<uuid:pk>/generate-scalar-prompts/", + viewset({"post": "generate_scalar_prompts"}), + name="vibe-extractor-generate-scalar-prompts", + ), + # Generate table extraction prompts + path( + "vibe-extractor/<uuid:pk>/generate-table-prompts/", + viewset({"post": "generate_table_prompts"}), + name="vibe-extractor-generate-table-prompts", + ), + # Read generated file + path( + "vibe-extractor/<uuid:pk>/read-file/", + viewset({"get": "read_file"}), + name="vibe-extractor-read-file", + ), + # List generated files + path( + "vibe-extractor/<uuid:pk>/list-files/", + viewset({"get": "list_files"}), + name="vibe-extractor-list-files", + ), + # Guess document type from file + path( + "vibe-extractor/guess-document-type/", + viewset({"post": "guess_document_type"}), + name="vibe-extractor-guess-document-type", + ), +] + +# Combine router URLs with explicit generation patterns +urlpatterns = router.urls + generation_patterns diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/vibe_extractor_helper.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/vibe_extractor_helper.py new file mode 100644 index 0000000000..aca8150bbe --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/vibe_extractor_helper.py @@ -0,0 +1,355 @@ +"""Helper functions for Vibe Extractor operations.""" + +import logging +from pathlib import Path +from typing import Any + +import yaml +from 
django.conf import settings +from utils.file_storage.helpers.prompt_studio_file_helper import ( + PromptStudioFileHelper, +) + +from prompt_studio.prompt_studio_vibe_extractor_v2.constants import ( + VibeExtractorFileNames, + VibeExtractorPaths, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.exceptions import ( + FileReadError, + InvalidDocumentTypeError, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.models import ( + VibeExtractorProject, +) + +logger = logging.getLogger(__name__) + + +class VibeExtractorHelper: + """Helper class for Vibe Extractor operations.""" + + @staticmethod + def validate_document_type(doc_type: str) -> str: + """Validate and normalize document type name. + + Args: + doc_type: Document type name + + Returns: + Normalized document type (lowercase with hyphens) + + Raises: + InvalidDocumentTypeError: If document type is invalid + """ + if not doc_type or not doc_type.strip(): + raise InvalidDocumentTypeError("Document type cannot be empty") + + # Convert to lowercase and replace spaces with hyphens + normalized = doc_type.lower().replace(" ", "-").replace("_", "-") + + # Remove special characters except hyphens + normalized = "".join(c for c in normalized if c.isalnum() or c == "-") + + if not normalized: + raise InvalidDocumentTypeError(f"Invalid document type: {doc_type}") + + return normalized + + @staticmethod + def get_project_output_path(project: VibeExtractorProject) -> Path: + """Get the output path for a project. + + Args: + project: VibeExtractorProject instance + + Returns: + Path object for the project output directory + """ + if project.generation_output_path: + return Path(project.generation_output_path) + + # Default to staging directory + base_dir = getattr( + settings, + "VIBE_EXTRACTOR_OUTPUT_DIR", + Path(settings.BASE_DIR).parent / VibeExtractorPaths.STAGING_DIR, + ) + normalized_type = VibeExtractorHelper.validate_document_type( + project.document_type + ) + return Path(base_dir) / normalized_type + + @staticmethod + def ensure_output_directory(project: VibeExtractorProject) -> Path: + """Ensure output directory exists for a project. + + Args: + project: VibeExtractorProject instance + + Returns: + Path object for the created directory + """ + output_path = VibeExtractorHelper.get_project_output_path(project) + output_path.mkdir(parents=True, exist_ok=True) + + # Create prompts subdirectory + prompts_path = output_path / VibeExtractorPaths.PROMPTS_DIR + prompts_path.mkdir(parents=True, exist_ok=True) + + return output_path + + @staticmethod + def read_generated_file(project: VibeExtractorProject, file_type: str) -> str: + """Read a generated file for a project. 
+ + Args: + project: VibeExtractorProject instance + file_type: Type of file to read + + Returns: + Content of the file + + Raises: + FileReadError: If file cannot be read + """ + output_path = VibeExtractorHelper.get_project_output_path(project) + + file_map = { + "metadata": output_path / VibeExtractorFileNames.METADATA_YAML, + "extraction": output_path / VibeExtractorFileNames.EXTRACTION_YAML, + "page_extraction_system": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.PAGE_EXTRACTION_SYSTEM_MD, + "page_extraction_user": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.PAGE_EXTRACTION_USER_MD, + "scalars_extraction_system": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.SCALARS_EXTRACTION_SYSTEM_MD, + "scalars_extraction_user": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.SCALARS_EXTRACTION_USER_MD, + "tables_extraction_system": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.TABLES_EXTRACTION_SYSTEM_MD, + "tables_extraction_user": output_path + / VibeExtractorPaths.PROMPTS_DIR + / VibeExtractorFileNames.TABLES_EXTRACTION_USER_MD, + } + + file_path = file_map.get(file_type) + if not file_path: + raise FileReadError(f"Unknown file type: {file_type}") + + if not file_path.exists(): + raise FileReadError(f"File not found: {file_path}. Generate the files first.") + + try: + with open(file_path) as f: + return f.read() + except Exception as e: + raise FileReadError(f"Error reading file {file_path}: {str(e)}") from e + + @staticmethod + def update_generation_progress( + project: VibeExtractorProject, + step: str, + status: str, + message: str | None = None, + ) -> None: + """Update generation progress for a project. + + Args: + project: VibeExtractorProject instance + step: Generation step name + status: Status of the step (pending, in_progress, completed, failed) + message: Optional message + """ + if not project.generation_progress: + project.generation_progress = {} + + project.generation_progress[step] = { + "status": status, + "message": message or "", + } + project.save(update_fields=["generation_progress", "modified_at"]) + + @staticmethod + def get_reference_template(template_name: str) -> str: + """Get reference template content. + + Args: + template_name: Name of the template file + + Returns: + Content of the reference template + + Raises: + FileReadError: If template cannot be read + """ + reference_dir = getattr( + settings, + "VIBE_EXTRACTOR_REFERENCE_DIR", + Path(settings.BASE_DIR).parent / VibeExtractorPaths.REFERENCE_DIR, + ) + template_path = Path(reference_dir) / template_name + + if not template_path.exists(): + raise FileReadError(f"Reference template not found: {template_path}") + + try: + with open(template_path) as f: + return f.read() + except Exception as e: + raise FileReadError( + f"Error reading reference template {template_path}: {str(e)}" + ) from e + + @staticmethod + def save_yaml_file(output_path: Path, filename: str, content: dict[str, Any]) -> None: + """Save content as YAML file. + + Args: + output_path: Output directory path + filename: Name of the file + content: Content to save as YAML + """ + file_path = output_path / filename + with open(file_path, "w") as f: + yaml.dump(content, f, default_flow_style=False, sort_keys=False) + + @staticmethod + def save_markdown_file(output_path: Path, filename: str, content: str) -> None: + """Save content as markdown file. 
+ + Args: + output_path: Output directory path + filename: Name of the file + content: Content to save + """ + file_path = output_path / filename + with open(file_path, "w") as f: + f.write(content) + + @staticmethod + def guess_document_type_from_file( + file_name: str, + tool_id: str, + org_id: str, + user_id: str, + ) -> dict[str, Any]: + """Guess document type from file content. + + This method: + 1. Constructs the file path using permanent file storage + 2. Reads the file content using dynamic_extractor + 3. Calls prompt-service to guess the document type using an LLM + + Args: + file_name: Name of the file in permanent storage + tool_id: Tool ID to construct the file path + org_id: Organization ID + user_id: User ID + + Returns: + Dictionary containing: + - status: "success" or "error" + - document_type: Guessed document type (if success) + - confidence: Confidence score (if applicable) + - error: Error message (if error) + """ + try: + # Import here to avoid circular imports + from prompt_studio.prompt_profile_manager_v2.models import ProfileManager + from prompt_studio.prompt_studio_core_v2.models import CustomTool + from prompt_studio.prompt_studio_core_v2.prompt_studio_helper import ( + PromptStudioHelper, + ) + from prompt_studio.prompt_studio_vibe_extractor_v2.services.generator_service import ( + GeneratorService, + ) + + # Get the tool instance to access profile manager + tool = CustomTool.objects.get(pk=tool_id) + + # Get default profile for extraction + default_profile = ProfileManager.get_default_llm_profile(tool) + + # Construct file path using PromptStudioFileHelper + file_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory( + org_id=org_id, + user_id=user_id, + tool_id=tool_id, + is_create=False, + ) + full_file_path = str(Path(file_path) / file_name) + + # Use dynamic_extractor to read and extract text from the file + from utils.file_storage.constants import FileStorageKeys + + from prompt_studio.prompt_studio_core_v2.prompt_ide_base_tool import ( + PromptIdeBaseTool, + ) + from unstract.sdk.constants import LogLevel + from unstract.sdk1.file_storage.constants import StorageType + from unstract.sdk1.file_storage.env_helper import EnvHelper + from unstract.sdk1.utils.indexing import IndexingUtils + + fs_instance = EnvHelper.get_storage( + storage_type=StorageType.PERMANENT, + env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE, + ) + util = PromptIdeBaseTool(log_level=LogLevel.INFO, org_id=org_id) + + # Generate doc_id for extraction + doc_id = IndexingUtils.generate_index_key( + vector_db=str(default_profile.vector_store.id), + embedding=str(default_profile.embedding_model.id), + x2text=str(default_profile.x2text.id), + chunk_size=str(default_profile.chunk_size), + chunk_overlap=str(default_profile.chunk_overlap), + file_path=full_file_path, + file_hash=None, + fs=fs_instance, + tool=util, + ) + + # Extract text from the file + extracted_text = PromptStudioHelper.dynamic_extractor( + profile_manager=default_profile, + file_path=full_file_path, + org_id=org_id, + document_id=None, # Not needed for this operation + run_id=None, + enable_highlight=False, + doc_id=doc_id, + ) + + if not extracted_text or not extracted_text.strip(): + return { + "status": "error", + "error": "Could not extract text from file", + } + + # Get LLM configuration from system LLM + llm_config = GeneratorService._get_llm_config() + + # Call prompt-service via SDK helper + from prompt_studio.prompt_studio_vibe_extractor_v2.services.prompt_service_helper import ( + 
VibeExtractorPromptServiceHelper, + ) + + result = VibeExtractorPromptServiceHelper.guess_document_type( + file_content=extracted_text, + llm_config=llm_config, + org_id=org_id, + ) + + return result + + except Exception as e: + logger.error(f"Error guessing document type: {str(e)}", exc_info=True) + return { + "status": "error", + "error": str(e), + } diff --git a/backend/prompt_studio/prompt_studio_vibe_extractor_v2/views.py b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/views.py new file mode 100644 index 0000000000..0d0b16ef61 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_vibe_extractor_v2/views.py @@ -0,0 +1,543 @@ +from django.db.models import QuerySet +from rest_framework import status, viewsets +from rest_framework.decorators import action +from rest_framework.request import Request +from rest_framework.response import Response +from rest_framework.versioning import URLPathVersioning +from utils.filtering import FilterHelper + +from prompt_studio.permission import PromptAcesssToUser +from prompt_studio.prompt_studio_vibe_extractor_v2.constants import ( + VibeExtractorKeys, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.exceptions import ( + FileReadError, + InvalidDocumentTypeError, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.models import ( + VibeExtractorProject, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.serializers import ( + VibeExtractorFileReadSerializer, + VibeExtractorGenerateExtractionFieldsSerializer, + VibeExtractorGenerateMetadataSerializer, + VibeExtractorGeneratePagePromptsSerializer, + VibeExtractorGenerateScalarPromptsSerializer, + VibeExtractorGenerateTablePromptsSerializer, + VibeExtractorGuessDocumentTypeSerializer, + VibeExtractorProjectCreateSerializer, + VibeExtractorProjectSerializer, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.services.generator_service import ( + GeneratorService, +) +from prompt_studio.prompt_studio_vibe_extractor_v2.vibe_extractor_helper import ( + VibeExtractorHelper, +) + + +class VibeExtractorProjectView(viewsets.ModelViewSet): + """Viewset to handle Vibe Extractor project CRUD operations. + + Provides endpoints for: + - Creating new extraction projects + - Listing projects + - Retrieving project details + - Updating project settings + - Deleting projects + - Triggering generation + - Reading generated files + """ + + versioning_class = URLPathVersioning + serializer_class = VibeExtractorProjectSerializer + permission_classes: list[type[PromptAcesssToUser]] = [PromptAcesssToUser] + + def get_queryset(self) -> QuerySet: + """Get queryset filtered by tool_id if provided.""" + filter_args = FilterHelper.build_filter_args( + self.request, + VibeExtractorKeys.TOOL_ID, + ) + if filter_args: + queryset = VibeExtractorProject.objects.filter(**filter_args) + else: + queryset = VibeExtractorProject.objects.all() + return queryset + + def create(self, request: Request, *args, **kwargs) -> Response: + """Create a new Vibe Extractor project. 
+ + Args: + request: HTTP request with document_type and optional tool_id + + Returns: + Response with created project data + """ + serializer = VibeExtractorProjectCreateSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + # Validate and normalize document type + document_type = VibeExtractorHelper.validate_document_type( + serializer.validated_data["document_type"] + ) + + # Create project + project = VibeExtractorProject.objects.create( + document_type=document_type, + tool_id_id=serializer.validated_data.get("tool_id"), + created_by=request.user, + modified_by=request.user, + ) + + # Create output directory + output_path = VibeExtractorHelper.ensure_output_directory(project) + project.generation_output_path = str(output_path) + project.save(update_fields=["generation_output_path"]) + + response_serializer = VibeExtractorProjectSerializer(project) + return Response(response_serializer.data, status=status.HTTP_201_CREATED) + + except InvalidDocumentTypeError as e: + return Response({"error": str(e)}, status=status.HTTP_400_BAD_REQUEST) + except Exception as e: + return Response( + {"error": f"Failed to create project: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["post"]) + def generate_metadata(self, request: Request, pk=None) -> Response: + """Generate only metadata for a project. + + Args: + request: HTTP request + pk: Project ID + + Returns: + Response with generated metadata + """ + serializer = VibeExtractorGenerateMetadataSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + project = self.get_object() + + # Start generation in background + import threading + + def run_generation(): + """Run metadata generation in background thread.""" + try: + GeneratorService.generate_metadata_only(project) + except Exception as e: + import logging + + logger = logging.getLogger(__name__) + logger.error(f"Background generation failed: {e}", exc_info=True) + + thread = threading.Thread(target=run_generation) + thread.daemon = True + thread.start() + + return Response( + { + "message": "Metadata generation started", + "project_id": str(project.project_id), + "status": project.status, + }, + status=status.HTTP_202_ACCEPTED, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Generation failed: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["post"]) + def generate_extraction_fields(self, request: Request, pk=None) -> Response: + """Generate extraction fields for a project. 
+ + Args: + request: HTTP request with metadata + pk: Project ID + + Returns: + Response with generated extraction fields + """ + serializer = VibeExtractorGenerateExtractionFieldsSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + project = self.get_object() + metadata = serializer.validated_data["metadata"] + + # Start generation in background + import threading + + def run_generation(): + """Run extraction fields generation in background thread.""" + try: + GeneratorService.generate_extraction_fields_only(project, metadata) + except Exception as e: + import logging + + logger = logging.getLogger(__name__) + logger.error(f"Background generation failed: {e}", exc_info=True) + + thread = threading.Thread(target=run_generation) + thread.daemon = True + thread.start() + + return Response( + { + "message": "Extraction fields generation started", + "project_id": str(project.project_id), + "status": project.status, + }, + status=status.HTTP_202_ACCEPTED, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Generation failed: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["post"]) + def generate_page_prompts(self, request: Request, pk=None) -> Response: + """Generate page extraction prompts for a project. + + Args: + request: HTTP request with metadata + pk: Project ID + + Returns: + Response with generated prompts + """ + serializer = VibeExtractorGeneratePagePromptsSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + project = self.get_object() + metadata = serializer.validated_data["metadata"] + + # Start generation in background + import threading + + def run_generation(): + """Run page prompts generation in background thread.""" + try: + GeneratorService.generate_page_extraction_prompts(project, metadata) + except Exception as e: + import logging + + logger = logging.getLogger(__name__) + logger.error(f"Background generation failed: {e}", exc_info=True) + + thread = threading.Thread(target=run_generation) + thread.daemon = True + thread.start() + + return Response( + { + "message": "Page prompts generation started", + "project_id": str(project.project_id), + "status": project.status, + }, + status=status.HTTP_202_ACCEPTED, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Generation failed: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["post"]) + def generate_scalar_prompts(self, request: Request, pk=None) -> Response: + """Generate scalar extraction prompts for a project. 
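+
+        Example (illustrative payload; both keys are required by the
+        serializer):
+
+            POST .../vibe-extractor/<project_id>/generate_scalar_prompts/
+            {"metadata": {...}, "extraction_yaml": "<extraction.yaml content>"}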
+ + Args: + request: HTTP request with metadata and extraction_yaml + pk: Project ID + + Returns: + Response with generated prompts + """ + serializer = VibeExtractorGenerateScalarPromptsSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + project = self.get_object() + metadata = serializer.validated_data["metadata"] + extraction_yaml = serializer.validated_data["extraction_yaml"] + + # Start generation in background + import threading + + def run_generation(): + """Run scalar prompts generation in background thread.""" + try: + GeneratorService.generate_scalar_extraction_prompts( + project, metadata, extraction_yaml + ) + except Exception as e: + import logging + + logger = logging.getLogger(__name__) + logger.error(f"Background generation failed: {e}", exc_info=True) + + thread = threading.Thread(target=run_generation) + thread.daemon = True + thread.start() + + return Response( + { + "message": "Scalar prompts generation started", + "project_id": str(project.project_id), + "status": project.status, + }, + status=status.HTTP_202_ACCEPTED, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Generation failed: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["post"]) + def generate_table_prompts(self, request: Request, pk=None) -> Response: + """Generate table extraction prompts for a project. + + Args: + request: HTTP request with metadata and extraction_yaml + pk: Project ID + + Returns: + Response with generated prompts + """ + serializer = VibeExtractorGenerateTablePromptsSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + project = self.get_object() + metadata = serializer.validated_data["metadata"] + extraction_yaml = serializer.validated_data["extraction_yaml"] + + # Start generation in background + import threading + + def run_generation(): + """Run table prompts generation in background thread.""" + try: + GeneratorService.generate_table_extraction_prompts( + project, metadata, extraction_yaml + ) + except Exception as e: + import logging + + logger = logging.getLogger(__name__) + logger.error(f"Background generation failed: {e}", exc_info=True) + + thread = threading.Thread(target=run_generation) + thread.daemon = True + thread.start() + + return Response( + { + "message": "Table prompts generation started", + "project_id": str(project.project_id), + "status": project.status, + }, + status=status.HTTP_202_ACCEPTED, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Generation failed: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["get"]) + def read_file(self, request: Request, pk=None) -> Response: + """Read a generated file for a project. 
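+
+        Example (illustrative; see ``list_files`` for the supported
+        ``file_type`` values):
+
+            GET .../vibe-extractor/<project_id>/read_file/?file_type=metadata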
+ + Args: + request: HTTP request with file_type parameter + pk: Project ID + + Returns: + Response with file content + """ + file_type = request.query_params.get("file_type") + if not file_type: + return Response( + {"error": "file_type parameter is required"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + serializer = VibeExtractorFileReadSerializer(data={"file_type": file_type}) + if not serializer.is_valid(): + return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) + + try: + project = self.get_object() + content = VibeExtractorHelper.read_generated_file(project, file_type) + + return Response( + { + "file_type": file_type, + "content": content, + "project_id": str(project.project_id), + }, + status=status.HTTP_200_OK, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except FileReadError as e: + return Response({"error": str(e)}, status=status.HTTP_404_NOT_FOUND) + except Exception as e: + return Response( + {"error": f"Failed to read file: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=True, methods=["get"]) + def list_files(self, request: Request, pk=None) -> Response: + """List all generated files for a project. + + Args: + request: HTTP request + pk: Project ID + + Returns: + Response with list of available files + """ + try: + project = self.get_object() + output_path = VibeExtractorHelper.get_project_output_path(project) + + files = [] + file_types = [ + "metadata", + "extraction", + "page_extraction_system", + "page_extraction_user", + "scalars_extraction_system", + "scalars_extraction_user", + "tables_extraction_system", + "tables_extraction_user", + ] + + for file_type in file_types: + try: + VibeExtractorHelper.read_generated_file(project, file_type) + files.append({"file_type": file_type, "exists": True}) + except FileReadError: + files.append({"file_type": file_type, "exists": False}) + + return Response( + { + "project_id": str(project.project_id), + "files": files, + }, + status=status.HTTP_200_OK, + ) + + except VibeExtractorProject.DoesNotExist: + return Response( + {"error": "Project not found"}, + status=status.HTTP_404_NOT_FOUND, + ) + except Exception as e: + return Response( + {"error": f"Failed to list files: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + @action(detail=False, methods=["post"]) + def guess_document_type(self, request: Request) -> Response: + """Guess document type from file content. 
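+
+        Example (illustrative payload; values are placeholders):
+
+            POST .../vibe-extractor/guess_document_type/
+            {"file_name": "statement.pdf", "tool_id": "<tool-uuid>"}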
+ + Args: + request: HTTP request with file_name and tool_id + + Returns: + Response with guessed document type + """ + serializer = VibeExtractorGuessDocumentTypeSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + file_name = serializer.validated_data["file_name"] + tool_id = serializer.validated_data["tool_id"] + + # Call the helper to guess document type + result = VibeExtractorHelper.guess_document_type_from_file( + file_name=file_name, + tool_id=str(tool_id), + org_id=request.user.organization_id, + user_id=request.user.user_id, + ) + + if result.get("status") == "error": + return Response( + { + "error": result.get("error"), + "raw_response": result.get("raw_response"), + "attempted_json": result.get("attempted_json"), + "partial_response": result.get("partial_response"), + }, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + return Response( + { + "document_type": result.get("document_type"), + "confidence": result.get("confidence"), + "primary_indicators": result.get("primary_indicators", []), + "document_category": result.get("document_category"), + "alternative_types": result.get("alternative_types", []), + "reasoning": result.get("reasoning"), + }, + status=status.HTTP_200_OK, + ) + + except Exception as e: + return Response( + {"error": f"Failed to guess document type: {str(e)}"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) diff --git a/prompt-service/pyproject.toml b/prompt-service/pyproject.toml index 09e41a8b2b..8d08387ff8 100644 --- a/prompt-service/pyproject.toml +++ b/prompt-service/pyproject.toml @@ -19,7 +19,11 @@ dependencies = [ "redis>=5.0.3,<5.3", "unstract-core", "unstract-flags", - "unstract-sdk1[aws,gcs,azure]" + "unstract-sdk1[aws,gcs,azure]", + # Autogen packages for Vibe Extractor LLM generation + "autogen-core>=0.4.0", + "autogen-ext>=0.4.0", + "autogen-agentchat>=0.4.0", ] [tool.uv.sources] diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/README.md b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/README.md new file mode 100644 index 0000000000..0794d6a9a9 --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/README.md @@ -0,0 +1,350 @@ +# Vibe Extractor Service + +The Vibe Extractor Service is an agentic system that automatically generates document extraction metadata, fields, and prompts using LLM technology. It follows the architecture and patterns from the `new_document_type_generator.py` reference implementation. 
+
+## Overview
+
+This service generates all the necessary components for document extraction:
+- Document metadata (metadata.yaml)
+- Extraction fields (extraction.yaml)
+- Page extraction prompts (system and user)
+- Scalar extraction prompts (system and user)
+- Table extraction prompts (system and user)
+
+## Architecture
+
+### Components
+
+```
+vibe_extractor/
+├── __init__.py      # Package exports
+├── constants.py     # Bootstrap prompts and constants
+├── llm_helper.py    # LLM client initialization (using autogen-ext)
+├── generator.py     # Core generation logic
+├── service.py       # Service orchestration
+├── api_helper.py    # API integration helpers
+└── README.md        # This file
+```
+
+### LLM Adapter Pattern
+
+The service uses the autogen-ext library for LLM communication, making it easy to swap between providers:
+
+- **OpenAI**: Standard OpenAI models
+- **Azure OpenAI**: Azure-hosted OpenAI models
+- **Anthropic**: Claude models
+- **Bedrock**: AWS Bedrock with Claude models
+
+This architecture is designed to be easily replaceable with the new autogen client when it becomes available.
+
+## Usage
+
+### Basic Usage
+
+```python
+from unstract.prompt_service.services.vibe_extractor.api_helper import (
+    generate_document_extraction_components_async
+)
+
+# Configure LLM
+llm_config = {
+    "adapter_id": "anthropic",
+    "model": "claude-3-5-sonnet-20241022",
+    "api_key": "sk-ant-...",
+    "temperature": 0.7,
+    "max_tokens": 4096
+}
+
+# Generate all components (use the _sync variant from synchronous code)
+result = await generate_document_extraction_components_async(
+    doc_type="invoice",
+    output_dir="/path/to/output",
+    llm_config=llm_config
+)
+
+if result["status"] == "success":
+    print(f"Generated files at: {result['output_path']}")
+    print(f"Files: {result['files']}")
+else:
+    print(f"Error: {result['error']}")
+```
+
+### Backend Integration
+
+The backend integrates with this service through the `GeneratorService` class:
+
+```python
+from prompt_studio.prompt_studio_vibe_extractor_v2.services.generator_service import (
+    GeneratorService
+)
+
+# Generate all components for a project
+result = GeneratorService.generate_all(project)
+```
+
+## Configuration
+
+### Environment Variables
+
+For the backend to use this service, configure these environment variables:
+
+```bash
+# LLM Provider Configuration
+VIBE_EXTRACTOR_ADAPTER_ID=anthropic  # or openai, azureopenai, bedrock
+VIBE_EXTRACTOR_MODEL=claude-3-5-sonnet-20241022
+VIBE_EXTRACTOR_API_KEY=your-api-key-here
+VIBE_EXTRACTOR_TEMPERATURE=0.7
+VIBE_EXTRACTOR_MAX_TOKENS=4096
+
+# For Azure OpenAI
+VIBE_EXTRACTOR_API_BASE=https://your-resource.openai.azure.com/
+VIBE_EXTRACTOR_API_VERSION=2024-02-15-preview
+VIBE_EXTRACTOR_DEPLOYMENT=your-deployment-name
+
+# For AWS Bedrock
+VIBE_EXTRACTOR_AWS_ACCESS_KEY_ID=your-access-key
+VIBE_EXTRACTOR_AWS_SECRET_ACCESS_KEY=your-secret-key
+VIBE_EXTRACTOR_REGION_NAME=us-east-1
+```
+
+### Django Settings
+
+Alternatively, configure in Django settings.py:
+
+```python
+VIBE_EXTRACTOR_LLM_CONFIG = {
+    "adapter_id": "anthropic",
+    "model": "claude-3-5-sonnet-20241022",
+    "api_key": os.environ.get("ANTHROPIC_API_KEY"),
+    "temperature": 0.7,
+    "max_tokens": 4096,
+}
+```
+
+## API Endpoints
+
+### Backend API Endpoints
+
+#### Create Project
+```http
+POST /api/v1/vibe-extractor/
+Content-Type: application/json
+
+{
+    "document_type": "invoice",
+    "tool_id": "optional-tool-uuid"
+}
+```
+
+#### Generate Components
+```http
+POST /api/v1/vibe-extractor/{project_id}/generate/
+Content-Type: application/json
+
+{
+    "regenerate": false
+}
+```
+
+Response:
+```json +{ + "message": "Generation started", + "project_id": "uuid", + "status": "generating_metadata" +} +``` + +#### Read Generated File +```http +GET /api/v1/vibe-extractor/{project_id}/read_file/?file_type=metadata +``` + +Response: +```json +{ + "file_type": "metadata", + "content": "...", + "project_id": "uuid" +} +``` + +Supported file types: +- `metadata`: metadata.yaml +- `extraction`: extraction.yaml +- `page_extraction_system`: Page extraction system prompt +- `page_extraction_user`: Page extraction user prompt +- `scalars_extraction_system`: Scalar extraction system prompt +- `scalars_extraction_user`: Scalar extraction user prompt +- `tables_extraction_system`: Table extraction system prompt +- `tables_extraction_user`: Table extraction user prompt + +#### List Generated Files +```http +GET /api/v1/vibe-extractor/{project_id}/list_files/ +``` + +Response: +```json +{ + "project_id": "uuid", + "files": [ + {"file_type": "metadata", "exists": true}, + {"file_type": "extraction", "exists": true}, + ... + ] +} +``` + +## Generation Steps + +The service generates components in the following sequence: + +1. **Generate Metadata** (`generating_metadata`) + - Creates metadata.yaml with document type information + - Includes name, description, tags, version, etc. + +2. **Generate Extraction Fields** (`generating_fields`) + - Creates extraction.yaml with field definitions + - Includes scalar fields and list/table fields + +3. **Generate Page Extraction Prompts** (`generating_prompts`) + - System prompt for page relevance detection + - User prompt for page analysis + +4. **Generate Scalar Extraction Prompts** + - System prompt for scalar field extraction + - User prompt for scalar extraction + +5. **Generate Table Extraction Prompts** + - System prompt for table/list extraction + - User prompt for table extraction + +Each step updates the project status and progress tracking. + +## Progress Tracking + +The service provides progress callbacks to track generation: + +```python +def progress_callback(step: str, status: str, message: str = ""): + print(f"Step: {step}, Status: {status}, Message: {message}") + +result = await service.generate_all( + doc_type="invoice", + reference_template=template, + progress_callback=progress_callback +) +``` + +## Error Handling + +The service includes comprehensive error handling: + +- Invalid LLM configuration +- API failures +- File I/O errors +- Invalid document types +- Generation failures + +All errors are logged and returned with descriptive messages. + +## Testing + +### Manual Testing + +1. Create a project: +```bash +curl -X POST http://localhost:8000/api/v1/vibe-extractor/ \ + -H "Content-Type: application/json" \ + -d '{"document_type": "invoice"}' +``` + +2. Start generation: +```bash +curl -X POST http://localhost:8000/api/v1/vibe-extractor/{project_id}/generate/ \ + -H "Content-Type: application/json" \ + -d '{}' +``` + +3. Check status: +```bash +curl http://localhost:8000/api/v1/vibe-extractor/{project_id}/ +``` + +4. Read generated files: +```bash +curl http://localhost:8000/api/v1/vibe-extractor/{project_id}/read_file/?file_type=metadata +``` + +## Future Enhancements + +### Autogen Client Migration + +The current implementation uses autogen-ext for LLM communication. When the new autogen client is ready, migration will be straightforward: + +1. Update `llm_helper.py` to use the new autogen client +2. Update `generate_with_llm()` function +3. 
No changes needed in `generator.py` or `service.py`
+
+### Celery Integration
+
+For production deployments, replace the threading-based background processing with Celery:
+
+```python
+from celery import shared_task
+
+@shared_task
+def generate_components_task(project_id):
+    project = VibeExtractorProject.objects.get(project_id=project_id)
+    return GeneratorService.generate_all(project)
+```
+
+### Caching
+
+Add caching for reference templates and frequently used prompts to improve performance.
+
+## Troubleshooting
+
+### Import Errors
+
+If you see import errors, ensure the prompt-service is properly installed:
+```bash
+cd prompt-service
+pip install -e .
+```
+
+### LLM Configuration Errors
+
+Verify your LLM configuration:
+```python
+from unstract.prompt_service.services.vibe_extractor.api_helper import (
+    validate_llm_config
+)
+
+is_valid, error = validate_llm_config(llm_config)
+if not is_valid:
+    print(f"Configuration error: {error}")
+```
+
+### Generation Failures
+
+Check the logs for detailed error messages:
+```bash
+tail -f /path/to/logs/django.log
+```
+
+## Code Style
+
+The implementation follows Unstract coding standards:
+- Type hints for all function parameters and returns
+- Comprehensive docstrings
+- Error handling and logging
+- Consistent naming conventions
+- Clean separation of concerns
+
+## References
+
+- Reference Implementation: `/home/harini/Documents/Workspace/unstract-omniparse-studio/tools/new_document_type_generator.py`
+- Rentroll Service (Adapter Pattern): `/home/harini/Documents/Workspace/unstract-cloud/rentroll-service/`
+- Backend Models: `backend/prompt_studio/prompt_studio_vibe_extractor_v2/models.py`
diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/__init__.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/__init__.py new file mode 100644 index 0000000000..459083d182 --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/__init__.py @@ -0,0 +1,12 @@
+"""Vibe Extractor service for generating document extraction prompts."""
+
+from .generator import VibeExtractorGenerator
+from .llm_helper import generate_with_llm, get_llm_client
+from .service import VibeExtractorService
+
+__all__ = [
+    "VibeExtractorGenerator",
+    "VibeExtractorService",
+    "get_llm_client",
+    "generate_with_llm",
+]
diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/api_helper.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/api_helper.py new file mode 100644 index 0000000000..0640356fd3 --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/api_helper.py @@ -0,0 +1,553 @@
+"""API Helper for Vibe Extractor.
+
+This module provides helper functions for backend API integration.
+"""
+
+import asyncio
+import logging
+from typing import Any, Dict, Optional
+
+from .service import VibeExtractorService
+
+logger = logging.getLogger(__name__)
+
+
+def _run_async(coro):
+    """Helper to run async coroutines in sync context.
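+
+    Uses asyncio.run() when no event loop is running in the current thread;
+    otherwise the coroutine is run on a fresh loop in a worker thread so an
+    already-running loop is never re-entered.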
+
+    Args:
+        coro: Coroutine to run
+
+    Returns:
+        Result of the coroutine
+    """
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        # No event loop running in this thread; asyncio.run creates one,
+        # runs the coroutine, and cleans up afterwards.
+        return asyncio.run(coro)
+
+    # A loop is already running in this thread. Calling run_until_complete()
+    # on a second loop here would raise, so run the coroutine on its own
+    # loop in a worker thread instead.
+    import concurrent.futures
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        return executor.submit(asyncio.run, coro).result()
+
+
+def generate_document_extraction_components_sync(
+    doc_type: str,
+    output_dir: str,
+    llm_config: Dict[str, Any],
+    reference_template: Optional[str] = None,
+    progress_callback: Optional[callable] = None,
+) -> Dict[str, Any]:
+    """Generate all document extraction components (sync version).
+
+    This is the main entry point for backend API to trigger generation.
+
+    Args:
+        doc_type: Document type name (e.g., "invoice", "receipt")
+        output_dir: Base output directory for generated files
+        llm_config: LLM configuration dictionary
+        reference_template: Optional reference metadata.yaml template content
+        progress_callback: Optional callback function(step, status, message)
+
+    Returns:
+        Dictionary containing generation result
+    """
+    return _run_async(
+        generate_document_extraction_components_async(
+            doc_type, output_dir, llm_config, reference_template, progress_callback
+        )
+    )
+
+
+async def generate_document_extraction_components_async(
+    doc_type: str,
+    output_dir: str,
+    llm_config: Dict[str, Any],
+    reference_template: Optional[str] = None,
+    progress_callback: Optional[callable] = None,
+) -> Dict[str, Any]:
+    """Generate all document extraction components.
+
+    This is the main entry point for backend API to trigger generation.
+
+    Args:
+        doc_type: Document type name (e.g., "invoice", "receipt")
+        output_dir: Base output directory for generated files
+        llm_config: LLM configuration dictionary containing:
+            - adapter_id: Provider (openai, anthropic, bedrock, azureopenai)
+            - model: Model name
+            - api_key: API key
+            - temperature: Temperature (default: 0.7)
+            - max_tokens: Max tokens (default: 4096)
+        reference_template: Optional reference metadata.yaml template content.
+            If not provided, a default template will be used.
+        progress_callback: Optional callback function(step, status, message)
+            to report generation progress
+
+    Returns:
+        Dictionary containing:
+            - status: "success" or "error"
+            - output_path: Path to generated files
+            - files: Dictionary of generated file paths
+            - error: Error message if status is "error"
+
+    Example:
+        ```python
+        llm_config = {
+            "adapter_id": "anthropic",
+            "model": "claude-3-5-sonnet-20241022",
+            "api_key": "sk-ant-...",
+            "temperature": 0.7,
+            "max_tokens": 4096,
+        }
+
+        result = await generate_document_extraction_components_async(
+            doc_type="invoice", output_dir="/path/to/output", llm_config=llm_config
+        )
+
+        if result["status"] == "success":
+            print(f"Generated files at: {result['output_path']}")
+            print(f"Files: {result['files']}")
+        else:
+            print(f"Error: {result['error']}")
+        ```
+    """
+    try:
+        # Use default reference template if not provided
+        if reference_template is None:
+            reference_template = _get_default_reference_template()
+
+        # Initialize service
+        service = VibeExtractorService(llm_config, output_dir)
+
+        # Generate all components
+        result = await service.generate_all(
+            doc_type, reference_template, progress_callback
+        )
+
+        return result
+
+    except Exception as e:
+        error_msg = f"Error in generate_document_extraction_components_async: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+        return {"status": "error", "error": error_msg}
+
+
+def generate_metadata_only_sync(
+    doc_type: str,
+    llm_config: Dict[str, Any],
+    reference_template: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Generate only metadata for a document type (sync version).
+
+    Args:
+        doc_type: Document type name
+        llm_config: LLM configuration dictionary
+        reference_template: Optional reference template
+
+    Returns:
+        Dictionary containing generated metadata or error
+    """
+    return _run_async(
+        generate_metadata_only_async(doc_type, llm_config, reference_template)
+    )
+
+
+async def generate_metadata_only_async(
+    doc_type: str,
+    llm_config: Dict[str, Any],
+    reference_template: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Generate only metadata for a document type.
+
+    Args:
+        doc_type: Document type name
+        llm_config: LLM configuration dictionary
+        reference_template: Optional reference template
+
+    Returns:
+        Dictionary containing generated metadata or error
+    """
+    try:
+        if reference_template is None:
+            reference_template = _get_default_reference_template()
+
+        # Initialize service with temporary output dir
+        service = VibeExtractorService(llm_config, "/tmp/vibe_extractor")
+
+        result = await service.generate_metadata_only(doc_type, reference_template)
+        return result
+
+    except Exception as e:
+        error_msg = f"Error generating metadata: {str(e)}"
+        logger.error(error_msg)
+        return {"status": "error", "error": error_msg}
+
+
+def generate_extraction_fields_only_sync(
+    doc_type: str,
+    metadata: Dict[str, Any],
+    llm_config: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Generate only extraction fields for a document type (sync version).
+
+    Args:
+        doc_type: Document type name
+        metadata: Metadata dictionary
+        llm_config: LLM configuration dictionary
+
+    Returns:
+        Dictionary containing extraction YAML or error
+    """
+    return _run_async(
+        generate_extraction_fields_only_async(doc_type, metadata, llm_config)
+    )
+
+
+async def generate_extraction_fields_only_async(
+    doc_type: str,
+    metadata: Dict[str, Any],
+    llm_config: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Generate only extraction fields for a document type.
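+
+    Example (sketch; assumes ``metadata`` came from
+    generate_metadata_only_async):
+
+        result = await generate_extraction_fields_only_async(
+            "invoice", metadata, llm_config
+        )
+        if result.get("status") != "error":
+            print(result)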
+ + Args: + doc_type: Document type name + metadata: Metadata dictionary + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing extraction YAML or error + """ + try: + # Initialize service with temporary output dir + service = VibeExtractorService(llm_config, "/tmp/vibe_extractor") + + result = await service.generate_extraction_fields_only(doc_type, metadata) + return result + + except Exception as e: + error_msg = f"Error generating extraction fields: {str(e)}" + logger.error(error_msg) + return {"status": "error", "error": error_msg} + + +def _get_default_reference_template() -> str: + """Get default reference metadata.yaml template. + + Returns: + Default reference template as string + """ + return """--- +name_identifier: example # Unique identifier +name: Example Document # Human-readable name +description: | # Description of the document type + Example document description. + This should be 3-4 sentences explaining what this document type is. +description_seo: | # SEO optimized description + SEO optimized description for example document. +html_meta_description: | # HTML meta description + HTML meta description for example document. +tags: # List of tags + - example + - document + - sample +version: 1.0.0 # Version +status: beta # Current status +visibility: public # Visibility +author: Zipstack Inc # Author +release_date: 2025-07-01 # Release date +price_multiplier: 1.0 # Price multiplier +llm_model: claude-sonnet-1-7 # LLM model +extraction_features: # Extraction features + locate_pages: true + rolling_window: false + challenge: false +""" + + +def generate_page_extraction_prompts_sync( + doc_type: str, + metadata: Dict[str, Any], + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate page extraction prompts (sync version). + + Args: + doc_type: Document type name + metadata: Metadata dictionary + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + return _run_async( + generate_page_extraction_prompts_async(doc_type, metadata, llm_config) + ) + + +async def generate_page_extraction_prompts_async( + doc_type: str, + metadata: Dict[str, Any], + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate page extraction prompts (system and user). + + Args: + doc_type: Document type name + metadata: Metadata dictionary + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + try: + # Initialize service with temporary output dir + service = VibeExtractorService(llm_config, "/tmp/vibe_extractor") + + # Generate both prompts + page_system_prompt = ( + await service.generator.generate_page_extraction_system_prompt( + doc_type, metadata + ) + ) + page_user_prompt = await service.generator.generate_page_extraction_user_prompt( + doc_type, metadata + ) + + return { + "status": "success", + "system_prompt": page_system_prompt, + "user_prompt": page_user_prompt, + } + + except Exception as e: + error_msg = f"Error generating page extraction prompts: {str(e)}" + logger.error(error_msg) + return {"status": "error", "error": error_msg} + + +def generate_scalar_extraction_prompts_sync( + doc_type: str, + metadata: Dict[str, Any], + extraction_yaml: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate scalar extraction prompts (sync version). 
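+
+    Example (sketch; suitable for synchronous callers such as Django views):
+
+        result = generate_scalar_extraction_prompts_sync(
+            "invoice", metadata, extraction_yaml, llm_config
+        )
+        if result["status"] == "success":
+            system_prompt = result["system_prompt"]
+            user_prompt = result["user_prompt"]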
+ + Args: + doc_type: Document type name + metadata: Metadata dictionary + extraction_yaml: Extraction YAML content + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + return _run_async( + generate_scalar_extraction_prompts_async( + doc_type, metadata, extraction_yaml, llm_config + ) + ) + + +async def generate_scalar_extraction_prompts_async( + doc_type: str, + metadata: Dict[str, Any], + extraction_yaml: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate scalar extraction prompts (system and user). + + Args: + doc_type: Document type name + metadata: Metadata dictionary + extraction_yaml: Extraction YAML content + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + try: + # Initialize service with temporary output dir + service = VibeExtractorService(llm_config, "/tmp/vibe_extractor") + + # Generate both prompts + scalar_system_prompt = ( + await service.generator.generate_scalar_extraction_system_prompt( + doc_type, metadata, extraction_yaml + ) + ) + scalar_user_prompt = ( + await service.generator.generate_scalar_extraction_user_prompt( + doc_type, metadata + ) + ) + + return { + "status": "success", + "system_prompt": scalar_system_prompt, + "user_prompt": scalar_user_prompt, + } + + except Exception as e: + error_msg = f"Error generating scalar extraction prompts: {str(e)}" + logger.error(error_msg) + return {"status": "error", "error": error_msg} + + +def generate_table_extraction_prompts_sync( + doc_type: str, + metadata: Dict[str, Any], + extraction_yaml: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate table extraction prompts (sync version). + + Args: + doc_type: Document type name + metadata: Metadata dictionary + extraction_yaml: Extraction YAML content + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + return _run_async( + generate_table_extraction_prompts_async( + doc_type, metadata, extraction_yaml, llm_config + ) + ) + + +async def generate_table_extraction_prompts_async( + doc_type: str, + metadata: Dict[str, Any], + extraction_yaml: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Generate table extraction prompts (system and user). + + Args: + doc_type: Document type name + metadata: Metadata dictionary + extraction_yaml: Extraction YAML content + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing system and user prompts or error + """ + try: + # Initialize service with temporary output dir + service = VibeExtractorService(llm_config, "/tmp/vibe_extractor") + + # Generate both prompts + table_system_prompt = ( + await service.generator.generate_table_extraction_system_prompt( + doc_type, metadata, extraction_yaml + ) + ) + table_user_prompt = await service.generator.generate_table_extraction_user_prompt( + doc_type, metadata + ) + + return { + "status": "success", + "system_prompt": table_system_prompt, + "user_prompt": table_user_prompt, + } + + except Exception as e: + error_msg = f"Error generating table extraction prompts: {str(e)}" + logger.error(error_msg) + return {"status": "error", "error": error_msg} + + +def validate_llm_config(llm_config: Dict[str, Any]) -> tuple[bool, Optional[str]]: + """Validate LLM configuration. 
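+
+    Example (sketch; values are placeholders):
+
+        is_valid, error = validate_llm_config(
+            {"adapter_id": "openai", "model": "gpt-4o", "api_key": "..."}
+        )
+        if not is_valid:
+            raise ValueError(error)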
+ + Args: + llm_config: LLM configuration dictionary + + Returns: + Tuple of (is_valid, error_message) + """ + required_fields = ["adapter_id", "model", "api_key"] + + for field in required_fields: + if field not in llm_config: + return False, f"Missing required field: {field}" + + valid_adapters = ["openai", "azureopenai", "anthropic", "bedrock"] + if llm_config["adapter_id"] not in valid_adapters: + return ( + False, + f"Invalid adapter_id: {llm_config['adapter_id']}. " + f"Must be one of: {', '.join(valid_adapters)}", + ) + + return True, None + + +def guess_document_type_sync( + file_content: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Guess document type from file content (sync version). + + Args: + file_content: Extracted text content from the document + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing guessed document type or error + """ + return _run_async(guess_document_type_async(file_content, llm_config)) + + +async def guess_document_type_async( + file_content: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Guess document type from file content using LLM. + + Args: + file_content: Extracted text content from the document + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing: + - status: "success" or "error" + - document_type: Guessed document type (if success) + - confidence: Confidence description (if applicable) + - error: Error message (if error) + """ + try: + # Validate LLM config + is_valid, error_msg = validate_llm_config(llm_config) + if not is_valid: + return {"status": "error", "error": error_msg} + + # Import LLM helper + from .llm_helper import guess_document_type_with_llm + + # Call LLM helper to guess document type + result = await guess_document_type_with_llm( + file_content=file_content, + llm_config=llm_config, + ) + + return result + + except Exception as e: + error_msg = f"Error guessing document type: {str(e)}" + logger.error(error_msg, exc_info=True) + return {"status": "error", "error": error_msg} diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/constants.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/constants.py new file mode 100644 index 0000000000..27aa8f3118 --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/constants.py @@ -0,0 +1,396 @@ +"""Constants for Vibe Extractor generation service.""" + + +class VibeExtractorBootstrapPrompts: + """Bootstrap prompts for generating document extraction components.""" + + DOCUMENT_METADATA = """Generate metadata for a document type called "{doc_type}". +Based on your knowledge of this document type, provide all the fields shown in the reference template below. +Focus on generating appropriate values for: +1. name_identifier (lowercase, hyphens instead of spaces) +2. name (human-readable name) +3. description (3-4 sentences explaining what this document type is) +4. description_seo (SEO-optimized version of description) +5. html_meta_description (HTML meta description) +6. tags (3-6 relevant tags) +7. status (typically "beta" for new document types) +8. visibility (typically "public") +IMPORTANT: For multiline text fields (description, description_seo, html_meta_description), use the YAML pipe syntax (|) to properly format multiline content. For example: +description: | + This is a multiline description + that spans multiple lines + and maintains proper formatting. 
+Use the reference template structure but adapt the content for "{doc_type}": +{reference_template} +Return your response as a YAML structure matching the exact format above, but with content appropriate for "{doc_type}". +Make sure to use the pipe syntax (|) for all description fields. +Only return the YAML structure, no additional text.""" + + DOCUMENT_EXTRACTION_FIELDS = """Generate an extraction.yaml structure for document type: "{doc_type}". +Document description: {metadata_description} +Create a YAML structure that defines the fields to extract from this document type. +Follow these IMPORTANT rules: +1. Include all relevant fields that would typically be found in a {doc_type} +2. Use descriptive field names with underscores (e.g., invoice_number, customer_name) +3. Add comments after each field using # to describe what it extracts +4. **CRITICAL**: List type fields should and can ONLY be one level deep +5. Use List ONLY for items that are actual lists in the document (e.g., line_items, taxes, discounts) +6. Do NOT generate nested items or objects in the extraction YAML file +7. The extraction YAML should ONLY contain: + - Scalar items: items with single values (e.g., invoice_number, date, total_amount, vendor_name) + - List type items: items that are lists/arrays (e.g., line_items, taxes, discounts) +8. List items should be one level deep with sub-fields, but no nested objects +9. Include both scalar fields and list fields where appropriate for {doc_type} documents +Example format: +# Scalar items (single values) +field_name: # Description of field +another_scalar: # Description of another scalar field +# List items (one level deep only) +list_field: # Description of list + - sub_field: # Description + another_field: # Description +IMPORTANT: Do NOT create nested objects or multi-level lists. Keep it simple: +- Scalar items for single values +- List items for arrays/lists (one level deep only) +Generate a comprehensive extraction structure for {doc_type} documents. +Return ONLY the YAML structure, no additional text.""" + + PAGE_EXTRACTION_SYSTEM = """Generate a system prompt for page extraction for document type: "{doc_type}". +Document description: {metadata_description} +Context: Some documents may have many pages of irrelevant data. The LLM needs to identify +pages that contain relevant data for this document type. +The LLM will be given a page of the document (including bottom half of previous page and +top half of next page for context). The LLM must decide whether the page contains relevant +data and respond with only "yes" or "no". +Generate a system prompt that: +1. Explains what this document type is +2. Describes what relevant data looks like for this document type +3. Lists what irrelevant data might be present +4. Provides clear instructions to respond only with "yes" or "no" +5. Gives examples of what to look for +Make the prompt comprehensive but concise. Focus on the specific characteristics of {doc_type} documents.""" + + PAGE_EXTRACTION_USER = """Generate a user prompt for page extraction for document type: "{doc_type}". +Document description: {metadata_description} +Context: This is the user prompt that will be sent along with the system prompt. The user +will provide a page of the document (including bottom half of previous page and top half +of next page for context). The LLM must decide whether the page contains relevant data +and respond with only "yes" or "no". +Generate a concise user prompt that: +1. Asks the LLM to analyze the provided page +2. 
Reminds the LLM to look for relevant {doc_type} data +3. Instructs to respond with only "yes" or "no" +Keep it short and direct - this will be used as a template for each page analysis.""" + + SCALARS_EXTRACTION_SYSTEM = """Generate a system prompt for scalar field extraction for document type: "{doc_type}". +Document description: {metadata_description} +Context: The LLM needs to extract scalar values from the document. Each line in the document +is numbered in hexadecimal format (0x0001, 0x0002, etc.). The LLM must extract values and +their line numbers. +The prompt must: +1. Have dedicated section with exact format: + ## Extraction Items + ```yaml + {{{{extraction_items}}}} + ``` +2. Use the handlebars variable only once in the prompt, refer to "## Extraction Items" section elsewhere +3. Have a section called "## Expected Variations of requested to available items" that lists possible variations of the scalar items based on the document type +4. Instruct to extract ONLY from the provided document (no prior knowledge) +5. Require ALL fields in output (use null if not found) +6. Include line numbers for each extracted value (format: _line_number_fieldname) +7. Output ONLY YAML format, no other text +8. Handle {doc_type}-specific extraction challenges +9. **CRITICAL**: Emphasize that the LLM must NOT perform any arithmetic operations, calculations, or other operations on values. Extract values exactly as they appear in the document. If a calculated field is required but not present in the document, it should be set to null. +Example output format (showing extracted values, not field names): +field_name: "extracted value from document" +_line_number_field_name: 0x0002 +missing_field: null +_line_number_missing_field: null +Example scalar fields: {scalar_fields} +Generate a comprehensive system prompt for scalar extraction with: +1. Dedicated section using exact format: + ## Extraction Items + ```yaml + {{{{extraction_items}}}} + ``` +2. Expected Variations section with {doc_type}-specific field variations""" + + SCALARS_EXTRACTION_USER = """Generate a concise user prompt for scalar field extraction for document type: "{doc_type}". +The user prompt should be very simple and direct. It should: +1. Ask the LLM to extract the specified fields from the document +2. Remind to follow the system instructions for format and line numbers +3. Be very brief - just 1-2 sentences +4. Not repeat detailed instructions (those are in the system prompt) +The prompt should be something like: +"Extract the specified fields from this {doc_type} document following the format requirements." +Generate a very concise user prompt.""" + + TABLES_EXTRACTION_SYSTEM = """Generate a system prompt for table/list extraction for document type: "{doc_type}". +Document description: {metadata_description} +Context: The LLM needs to extract table/list data in TSV format. Tables can span multiple pages, +have multi-line cells, and sometimes what appears to be a table is actually a simple list. +The prompt must: +1. Have dedicated section with exact format: + ## Extraction Items + ```yaml + {{{{extraction_items}}}} + ``` +2. Use the handlebars variable only once in the prompt, refer to "## Extraction Items" section elsewhere +3. Have a section called "## Expected Variations of requested to available items" that lists possible variations of the table items based on the document type +4. Handle rolling window documents (partial pages) +5. Handle tables spanning multiple pages with headers/footers +6. 
Handle multi-line cell content +7. Distinguish between tables and simple lists +8. Extract ONLY from provided document (no prior knowledge) +9. Include line numbers for each row (format: _line_no column) +10. Output TSV format with headers +11. Handle {doc_type}-specific table structures +12. **CRITICAL**: Emphasize that the LLM must NOT perform any arithmetic operations, calculations, or other operations on values. Extract values exactly as they appear in the document. If a calculated field is required but not present in the document, it should be set to null. +13. If the table is not present in the document, return an empty TSV file with header only +14. Output ONLY TSV format with no explanations, commentary, or other text +Include these specific examples in the prompt (use \\t to represent tabs in examples): +TYPE 1 - Normal tables example: +Document: +``` +0x0001: +0x0002: No Description Unit Discount +0x0004: Cost +0x0005: 1 Item 1 100.00 10.00 +0x0006: 2 Item 2 200.00 20.00 +0x0007: 3 Item 3 300.00 30.00 +``` +Note: "Unit Cost" spans two lines. +Output should be: +```tsv +_line_no\\tline_item_no\\tdescription\\tunit_cost\\tdiscount_percentage +0x0005\\t1\\tItem 1\\t100.00\\t10.00 +0x0006\\t2\\tItem 2\\t200.00\\t20.00 +0x0007\\t3\\tItem 3\\t300.00\\t30.00 +``` +TYPE 2 - Simple list example: +Document: +``` +0x0001: +0x0002: Special instructions: +0x0003: • Item 1 +0x0004: • Item 2 +0x0005: • Item 3 +``` +Output should be: +```tsv +_line_no\\titem +0x0003\\tItem 1 +0x0004\\tItem 2 +0x0005\\tItem 3 +``` +Generate a comprehensive system prompt for table/list extraction with: +1. Dedicated section using exact format: + ## Extraction Items + ```yaml + {{{{extraction_items}}}} + ``` +2. Expected Variations section with {doc_type}-specific field variations +3. Include these examples with \\t notation""" + + TABLES_EXTRACTION_USER = """Generate a concise user prompt for table extraction for document type: "{doc_type}". +The user prompt should be very simple and direct. It should: +1. Ask the LLM to extract the specified table/list from the document +2. Remind to follow the system instructions for TSV format +3. Be very brief - just 1-2 sentences +4. Not repeat detailed instructions (those are in the system prompt) +The prompt should be something like: +"Extract the table if it is present. If there is no matching table, reply No table found." +Generate a very concise user prompt.""" + + DOCUMENT_TYPE_IDENTIFICATION = """You are an expert document analyzer. Your task is to identify the type of document based on its content. + +Analyze the provided document content carefully and identify its type with high accuracy. + +## Document Analysis Guidelines + +When analyzing the document, look for these key indicators: + +### 1. Structural Elements +- Headers, footers, and watermarks +- Document layout and formatting +- Presence of logos or official seals +- Table structures and data organization +- Section headings and labels + +### 2. Content Markers +- Specific terminology and jargon +- Date formats and references +- Monetary values and calculations +- Legal or regulatory language +- Contact information and addresses + +### 3. Functional Purpose +- What is the primary purpose of this document? +- Who are the typical stakeholders (issuer, recipient)? +- What transaction or process does it document? +- What obligations or information does it convey? 
+ +## Common Document Types + +Consider these common business document categories: + +**Financial Documents:** +- Invoice: Itemized bill for goods/services with payment terms, invoice number, vendor details +- Receipt: Proof of payment showing transaction details, payment method, timestamp +- Purchase Order: Request to purchase goods/services with PO number, quantities, pricing +- Credit Note: Document issued for refunds or corrections to invoices +- Debit Note: Document for additional charges or corrections +- Bill of Lading: Shipping document detailing goods being transported +- Packing Slip: List of items included in a shipment +- Delivery Note: Confirmation of goods delivered +- Statement of Account: Summary of transactions over a period +- Payment Voucher: Authorization for payment + +**Banking Documents:** +- Bank Statement: Record of account transactions over a period +- Check/Cheque: Payment instrument drawn on a bank account +- Deposit Slip: Record of funds deposited into account +- Wire Transfer: Electronic fund transfer documentation +- Letter of Credit: Bank guarantee for international trade + +**Payroll & Employment:** +- Pay Stub/Payslip: Earnings statement showing salary breakdown +- W-2 Form: Annual wage and tax statement (US) +- Employment Contract: Agreement between employer and employee +- Offer Letter: Job offer with terms and conditions +- Timesheet: Record of hours worked + +**Tax & Compliance:** +- Tax Form (W-9, 1099, 1040, etc.): Various tax-related forms +- Tax Invoice: Invoice showing tax breakdown (VAT, GST, sales tax) +- Tax Return: Annual tax filing document +- Customs Declaration: Import/export declaration + +**Healthcare:** +- Medical Record: Patient medical history and treatment notes +- Prescription: Medication authorization from healthcare provider +- Lab Report: Medical test results and findings +- Insurance Claim: Request for insurance coverage/reimbursement +- EOB (Explanation of Benefits): Insurance payment explanation +- Medical Bill/Invoice: Healthcare services billing + +**Legal Documents:** +- Contract/Agreement: Legal binding agreement between parties +- NDA (Non-Disclosure Agreement): Confidentiality agreement +- Power of Attorney: Legal authorization document +- Certificate (Birth, Death, Marriage, etc.): Official certification +- License/Permit: Official authorization or permission +- Lease Agreement: Property rental contract +- Deed: Property ownership transfer document + +**Shipping & Logistics:** +- Shipping Label: Package destination and tracking information +- Air Waybill: Air cargo shipping document +- Commercial Invoice: International trade invoice +- Certificate of Origin: Document certifying product origin +- Customs Invoice: Invoice for customs clearance + +**HR & Administrative:** +- Application Form: Form for requesting service or admission +- Resume/CV: Career and qualifications summary +- Reference Letter: Professional or character reference +- Resignation Letter: Notice of employment termination +- Performance Review: Employee evaluation document + +**Correspondence:** +- Business Letter: Formal business correspondence +- Memo: Internal communication document +- Notice: Formal announcement or notification +- Minutes of Meeting: Record of meeting proceedings + +**Reports:** +- Business Report: Analysis or summary of business matters +- Financial Report: Financial performance analysis +- Audit Report: Financial or operational audit findings +- Technical Report: Technical analysis or specifications +- Research Report: Research 
findings and analysis + +**Other:** +- Warranty: Product or service guarantee +- Manual: Instruction or user guide +- Catalog: Product or service listings +- Brochure: Marketing or information material +- Quote/Estimate: Price proposal for goods/services +- RFP (Request for Proposal): Solicitation for vendor proposals +- Inventory List: Stock or asset listing + +## Response Format + +After analyzing the document, respond with **ONLY** a valid JSON object in this exact format: + +```json +{{ + "document_type": "identified-document-type", + "confidence": "high|medium|low", + "primary_indicators": [ + "specific indicator 1 that led to identification", + "specific indicator 2 that led to identification", + "specific indicator 3 that led to identification" + ], + "document_category": "category of the document", + "alternative_types": [ + "possible alternative document type if confidence is not high" + ], + "reasoning": "brief explanation of why this document type was identified" +}} +``` + +### Field Specifications: + +1. **document_type**: Use lowercase with hyphens (e.g., "invoice", "purchase-order", "medical-record", "bank-statement") + - Be specific: Use "tax-invoice" instead of just "invoice" if tax details are prominent + - Use compound names when necessary: "proof-of-delivery", "certificate-of-origin" + +2. **confidence**: + - "high": Multiple clear indicators, structure matches perfectly + - "medium": Good indicators but some ambiguity or missing elements + - "low": Limited indicators, could be multiple types + +3. **primary_indicators**: List 3-5 specific elements from the document that led to identification + - Example: "Invoice number INV-2024-001", "Payment terms: Net 30", "Itemized line items with tax" + +4. **document_category**: High-level category + - Examples: "financial", "legal", "healthcare", "shipping", "employment", "tax" + +5. **alternative_types**: If confidence is not high, list 1-2 possible alternatives + - Leave empty array if confidence is high + +6. **reasoning**: Brief 1-2 sentence explanation + - Focus on why this type fits best + - Mention key distinguishing features + +## Important Instructions + +1. **Extract, Don't Assume**: Base your analysis solely on the provided content +2. **Be Specific**: Choose the most specific document type (e.g., "proforma-invoice" vs "invoice") +3. **Consider Context**: Look at terminology, structure, and purpose together +4. **Regional Variations**: Consider that document types may have regional names (e.g., "invoice" vs "bill") +5. **JSON Only**: Return ONLY the JSON object, no additional text, explanations, or markdown formatting +6. **Handle Uncertainty**: If truly uncertain, use "medium" or "low" confidence and provide alternatives +7. **Standardize Names**: Use common, standardized document type names in lowercase-with-hyphens format + +## Example Response + +```json +{{ + "document_type": "purchase-order", + "confidence": "high", + "primary_indicators": [ + "PO Number: PO-2024-001234", + "Vendor details with 'SHIP TO' and 'BILL TO' sections", + "Line items with quantities and unit prices", + "Terms and conditions section", + "Signature block for approval" + ], + "document_category": "financial", + "alternative_types": [], + "reasoning": "Document contains all standard purchase order elements including PO number, vendor/buyer information, itemized products with quantities and prices, and approval signatures. 
The presence of delivery instructions and payment terms confirms this is a purchase order rather than an invoice or quote." +}} +``` + +Now analyze the document content provided and respond with your identification in the exact JSON format specified above.""" diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/generator.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/generator.py new file mode 100644 index 0000000000..a62f6dc06b --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/generator.py @@ -0,0 +1,458 @@ +"""Vibe Extractor Generator. + +This module generates document extraction metadata, fields, and prompts +using LLM-based agents, similar to the new_document_type_generator.py reference. +""" + +import logging +from pathlib import Path +from typing import Any + +import yaml + +from .constants import VibeExtractorBootstrapPrompts +from .llm_helper import generate_with_llm, get_llm_client + +logger = logging.getLogger(__name__) + + +class VibeExtractorGenerator: + """Generator for document extraction components using LLM.""" + + def __init__(self, llm_config: dict[str, Any]): + """Initialize the generator with LLM configuration. + + Args: + llm_config: Configuration dictionary for LLM client + - adapter_id: Provider (openai, anthropic, bedrock, etc.) + - model: Model name + - api_key: API key + - temperature: Temperature (default: 0.7) + - max_tokens: Max tokens (default: 4096) + """ + self.llm_config = llm_config + self.llm_client = None + + def _ensure_llm_client(self): + """Ensure LLM client is initialized.""" + if self.llm_client is None: + self.llm_client = get_llm_client(self.llm_config) + + def _clean_llm_response(self, response_text: str) -> str: + """Remove code block markers from LLM response. + + Args: + response_text: Raw response from LLM + + Returns: + Cleaned response text + """ + response_text = response_text.strip() + + # Remove markdown code blocks + if response_text.startswith("```markdown"): + response_text = response_text[11:] + elif response_text.startswith("```yaml"): + response_text = response_text[7:] + elif response_text.startswith("```"): + response_text = response_text[3:] + + if response_text.endswith("```"): + response_text = response_text[:-3] + + return response_text.strip() + + async def generate_metadata( + self, doc_type: str, reference_template: str + ) -> dict[str, Any]: + """Generate metadata for a document type using LLM. 
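+
+        Example (sketch; the returned mapping mirrors the keys of the
+        reference template):
+
+            metadata = await generator.generate_metadata(
+                "invoice", reference_template
+            )
+            description = metadata.get("description")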
+ + Args: + doc_type: Document type name (e.g., "invoice", "receipt") + reference_template: Reference metadata.yaml template content + + Returns: + Dictionary containing generated metadata + + Raises: + Exception: If metadata generation fails + """ + self._ensure_llm_client() + logger.info(f"Generating metadata for '{doc_type}' using LLM...") + + prompt = VibeExtractorBootstrapPrompts.DOCUMENT_METADATA.format( + doc_type=doc_type, reference_template=reference_template + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=1000) + + # Clean and parse YAML response + yaml_content = self._clean_llm_response(response) + metadata = yaml.safe_load(yaml_content) + + logger.info(f"Successfully generated metadata for '{doc_type}'") + return metadata + + except Exception as e: + error_msg = f"Error generating metadata: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_extraction_fields( + self, doc_type: str, metadata: dict[str, Any] + ) -> str: + """Generate extraction.yaml structure using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + + Returns: + YAML string defining extraction fields + + Raises: + Exception: If extraction fields generation fails + """ + self._ensure_llm_client() + logger.info(f"Generating extraction fields for '{doc_type}' using LLM...") + + metadata_description = metadata.get("description", "") + prompt = VibeExtractorBootstrapPrompts.DOCUMENT_EXTRACTION_FIELDS.format( + doc_type=doc_type, metadata_description=metadata_description + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=2000) + + # Clean YAML response + yaml_content = self._clean_llm_response(response) + + logger.info(f"Successfully generated extraction fields for '{doc_type}'") + return yaml_content + + except Exception as e: + error_msg = f"Error generating extraction fields: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_page_extraction_system_prompt( + self, doc_type: str, metadata: dict[str, Any] + ) -> str: + """Generate page extraction system prompt using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + + Returns: + System prompt text for page extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating page extraction system prompt for '{doc_type}' using LLM..." + ) + + metadata_description = metadata.get("description", "") + prompt = VibeExtractorBootstrapPrompts.PAGE_EXTRACTION_SYSTEM.format( + doc_type=doc_type, metadata_description=metadata_description + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=1500) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated page extraction system prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating page extraction system prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_page_extraction_user_prompt( + self, doc_type: str, metadata: dict[str, Any] + ) -> str: + """Generate page extraction user prompt using LLM. 
+ + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + + Returns: + User prompt text for page extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating page extraction user prompt for '{doc_type}' using LLM..." + ) + + metadata_description = metadata.get("description", "") + prompt = VibeExtractorBootstrapPrompts.PAGE_EXTRACTION_USER.format( + doc_type=doc_type, metadata_description=metadata_description + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=500) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated page extraction user prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating page extraction user prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_scalar_extraction_system_prompt( + self, doc_type: str, metadata: dict[str, Any], extraction_yaml: str + ) -> str: + """Generate scalar extraction system prompt using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + extraction_yaml: Generated extraction YAML content + + Returns: + System prompt text for scalar extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating scalar extraction system prompt for '{doc_type}' using LLM..." + ) + + # Parse extraction YAML to get scalar fields + try: + extraction_data = yaml.safe_load(extraction_yaml) + scalar_fields = [] + for key, value in extraction_data.items(): + if not isinstance(value, list): + scalar_fields.append(key) + except Exception: + scalar_fields = [] + + metadata_description = metadata.get("description", "") + scalar_fields_str = ", ".join(scalar_fields[:5]) + + prompt = VibeExtractorBootstrapPrompts.SCALARS_EXTRACTION_SYSTEM.format( + doc_type=doc_type, + metadata_description=metadata_description, + scalar_fields=scalar_fields_str, + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=1500) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated scalar extraction system prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating scalar extraction system prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_scalar_extraction_user_prompt( + self, doc_type: str, metadata: dict[str, Any] + ) -> str: + """Generate scalar extraction user prompt using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + + Returns: + User prompt text for scalar extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating scalar extraction user prompt for '{doc_type}' using LLM..." 
+ ) + + prompt = VibeExtractorBootstrapPrompts.SCALARS_EXTRACTION_USER.format( + doc_type=doc_type + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=500) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated scalar extraction user prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating scalar extraction user prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_table_extraction_system_prompt( + self, doc_type: str, metadata: dict[str, Any], extraction_yaml: str + ) -> str: + """Generate table extraction system prompt using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + extraction_yaml: Generated extraction YAML content + + Returns: + System prompt text for table extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating table extraction system prompt for '{doc_type}' using LLM..." + ) + + metadata_description = metadata.get("description", "") + prompt = VibeExtractorBootstrapPrompts.TABLES_EXTRACTION_SYSTEM.format( + doc_type=doc_type, metadata_description=metadata_description + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=2000) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated table extraction system prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating table extraction system prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + async def generate_table_extraction_user_prompt( + self, doc_type: str, metadata: dict[str, Any] + ) -> str: + """Generate table extraction user prompt using LLM. + + Args: + doc_type: Document type name + metadata: Generated metadata dictionary + + Returns: + User prompt text for table extraction + + Raises: + Exception: If prompt generation fails + """ + self._ensure_llm_client() + logger.info( + f"Generating table extraction user prompt for '{doc_type}' using LLM..." + ) + + prompt = VibeExtractorBootstrapPrompts.TABLES_EXTRACTION_USER.format( + doc_type=doc_type + ) + + try: + response = await generate_with_llm(self.llm_client, prompt, max_tokens=500) + + cleaned_response = self._clean_llm_response(response) + logger.info( + f"Successfully generated table extraction user prompt for '{doc_type}'" + ) + return cleaned_response + + except Exception as e: + error_msg = f"Error generating table extraction user prompt: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + def save_metadata_yaml(self, output_path: Path, metadata: dict[str, Any]) -> Path: + """Save metadata as YAML file. 
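+
+        Missing keys (version, author, release_date, price_multiplier,
+        llm_model, extraction_features) are filled with defaults before
+        writing. Illustrative usage (the output path is an assumption):
+
+            path = generator.save_metadata_yaml(Path("generated/invoice"), metadata)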
+ + Args: + output_path: Output directory path + metadata: Metadata dictionary to save + + Returns: + Path to saved metadata.yaml file + """ + # Add default values if not present + if "version" not in metadata: + metadata["version"] = "1.0.0" + if "author" not in metadata: + metadata["author"] = "Zipstack Inc" + if "release_date" not in metadata: + metadata["release_date"] = "2025-07-01" + if "price_multiplier" not in metadata: + metadata["price_multiplier"] = 1.0 + if "llm_model" not in metadata: + metadata["llm_model"] = "claude-sonnet-1-7" + if "extraction_features" not in metadata: + metadata["extraction_features"] = { + "locate_pages": True, + "rolling_window": False, + "challenge": False, + } + + metadata_file = output_path / "metadata.yaml" + with open(metadata_file, "w") as f: + yaml.dump(metadata, f, default_flow_style=False, sort_keys=False) + + logger.info(f"Saved metadata to {metadata_file}") + return metadata_file + + def save_extraction_yaml(self, output_path: Path, extraction_content: str) -> Path: + """Save extraction fields as YAML file. + + Args: + output_path: Output directory path + extraction_content: Extraction YAML content string + + Returns: + Path to saved extraction.yaml file + """ + extraction_file = output_path / "extraction.yaml" + with open(extraction_file, "w") as f: + f.write("---\n") + f.write(extraction_content) + if not extraction_content.endswith("\n"): + f.write("\n") + + logger.info(f"Saved extraction fields to {extraction_file}") + return extraction_file + + def save_prompt_file(self, output_path: Path, filename: str, content: str) -> Path: + """Save prompt content to markdown file. + + Args: + output_path: Output directory path (should include prompts subdir) + filename: Name of the markdown file + content: Prompt content + + Returns: + Path to saved prompt file + """ + prompt_file = output_path / filename + with open(prompt_file, "w") as f: + f.write(content) + + logger.info(f"Saved prompt to {prompt_file}") + return prompt_file diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/llm_helper.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/llm_helper.py new file mode 100644 index 0000000000..6602a3e020 --- /dev/null +++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/llm_helper.py @@ -0,0 +1,461 @@ +"""LLM Helper for Vibe Extractor. + +This module provides LLM client initialization and communication using autogen. +Uses autogen-ext clients where available, and creates compatible adapters for others. +""" + +import logging +from typing import Any, Dict, List, Optional, Sequence + +from autogen_core.models import ( + ChatCompletionClient, + LLMMessage, + SystemMessage, + UserMessage, +) +from autogen_ext.models.openai import ( + AzureOpenAIChatCompletionClient, + OpenAIChatCompletionClient, +) + +# Import SDKs (available through llama-index dependencies) +try: + import anthropic +except ImportError: + anthropic = None + +try: + import boto3 +except ImportError: + boto3 = None + +logger = logging.getLogger(__name__) + + +# ============================================================================ +# TEMPORARY TESTING METHOD - REMOVE AFTER TESTING +# ============================================================================ +def get_test_llm_config() -> Dict[str, Any]: + """Get hardcoded LLM config for testing purposes. + + TODO: REMOVE THIS AFTER TESTING - Use proper adapter configuration instead. 
+
+    This bypasses the platform settings and adapter infrastructure for quick testing.
+    To use, set the environment variable ANTHROPIC_API_KEY.
+
+    Returns:
+        Dict with hardcoded Anthropic configuration
+    """
+    import os
+
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    if not api_key:
+        raise ValueError(
+            "ANTHROPIC_API_KEY environment variable required for testing. "
+            "Set it in your .env file or environment."
+        )
+
+    return {
+        "adapter_id": "anthropic",
+        "model": "claude-3-5-sonnet-20241022",
+        "api_key": api_key,
+        "temperature": 0.1,
+        "max_tokens": 4096,
+    }
+# ============================================================================
+# END TEMPORARY TESTING METHOD
+# ============================================================================
+
+
+class AnthropicAdapter(ChatCompletionClient):
+    """Adapter to make the Anthropic SDK compatible with autogen's ChatCompletionClient interface."""
+
+    def __init__(self, api_key: str, model: str, temperature: float = 0.1,
+                 max_tokens: int = 4096, **kwargs):
+        if anthropic is None:
+            raise ImportError("anthropic package is required")
+
+        self._client = anthropic.Anthropic(api_key=api_key)
+        self._model = model
+        self._temperature = temperature
+        self._max_tokens = max_tokens
+
+    async def create(
+        self,
+        messages: Sequence[LLMMessage],
+        *,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Create a chat completion using the Anthropic API."""
+        # Convert autogen messages to Anthropic format. Anthropic takes the
+        # system prompt as a separate top-level parameter rather than as a
+        # message in the list.
+        anthropic_messages = []
+        system_prompt = None
+        for msg in messages:
+            if isinstance(msg, UserMessage):
+                anthropic_messages.append({"role": "user", "content": msg.content})
+            elif isinstance(msg, SystemMessage):
+                system_prompt = msg.content
+
+        try:
+            request_kwargs: Dict[str, Any] = {}
+            if system_prompt:
+                request_kwargs["system"] = system_prompt
+            response = self._client.messages.create(
+                model=self._model,
+                max_tokens=max_tokens if max_tokens is not None else self._max_tokens,
+                temperature=(
+                    temperature if temperature is not None else self._temperature
+                ),
+                messages=anthropic_messages,
+                **request_kwargs,
+            )
+
+            # Return in a format compatible with autogen
+            class CompletionResult:
+                def __init__(self, text):
+                    self.content = text
+                    self.choices = [type('obj', (object,), {'message': type('obj', (object,), {'content': text})()})]
+
+            return CompletionResult(response.content[0].text)
+
+        except Exception as e:
+            logger.error(f"Anthropic API error: {str(e)}")
+            raise
+
+
+class BedrockAdapter(ChatCompletionClient):
+    """Adapter to make AWS Bedrock compatible with autogen's ChatCompletionClient interface."""
+
+    def __init__(self, aws_access_key_id: str, aws_secret_access_key: str,
+                 region_name: str, model: str, temperature: float = 0.1,
+                 max_tokens: int = 4096, **kwargs):
+        if boto3 is None:
+            raise ImportError("boto3 is required for Bedrock")
+
+        session = boto3.Session(
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            region_name=region_name,
+        )
+
+        # Validate credentials
+        try:
+            session.get_credentials().get_frozen_credentials()
+        except Exception as e:
+            raise RuntimeError("Invalid AWS credentials") from e
+
+        self._client = session.client('bedrock-runtime', region_name=region_name)
+        self._model = model
+        self._temperature = temperature
+        self._max_tokens = max_tokens
+
+    async def create(
+        self,
+        messages: Sequence[LLMMessage],
+        *,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Create a chat completion using the Bedrock API."""
+        import json
+
+        # Convert autogen messages to Bedrock (Anthropic messages) format
+        bedrock_messages = []
+        system_prompt = None
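+        # The request body built below follows the Anthropic messages schema
+        # that Bedrock expects for Claude models, e.g. (illustrative values):
+        #   {
+        #       "anthropic_version": "bedrock-2023-05-31",
+        #       "max_tokens": 4096,
+        #       "system": "You are ...",  # optional, only when a system prompt is set
+        #       "messages": [{"role": "user", "content": "..."}],
+        #   }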
+        for msg in messages:
+            if isinstance(msg, UserMessage):
+                bedrock_messages.append({"role": "user", "content": msg.content})
+            elif isinstance(msg, SystemMessage):
+                system_prompt = msg.content
+
+        try:
+            request_body: Dict[str, Any] = {
+                "anthropic_version": "bedrock-2023-05-31",
+                "max_tokens": max_tokens if max_tokens is not None else self._max_tokens,
+                "temperature": (
+                    temperature if temperature is not None else self._temperature
+                ),
+                "messages": bedrock_messages,
+            }
+            if system_prompt:
+                request_body["system"] = system_prompt
+            body = json.dumps(request_body)
+
+            response = self._client.invoke_model(
+                modelId=self._model,
+                body=body
+            )
+
+            response_body = json.loads(response['body'].read())
+            text = response_body['content'][0]['text']
+
+            # Return in a format compatible with autogen
+            class CompletionResult:
+                def __init__(self, text):
+                    self.content = text
+                    self.choices = [type('obj', (object,), {'message': type('obj', (object,), {'content': text})()})]
+
+            return CompletionResult(text)
+
+        except Exception as e:
+            logger.error(f"Bedrock API error: {str(e)}")
+            raise
+
+
+def get_llm_client(llm_config: Dict[str, Any]) -> ChatCompletionClient:
+    """Initialize and return an LLM client based on configuration.
+
+    Args:
+        llm_config: Configuration dictionary containing:
+            - adapter_id: Provider identifier (openai, azureopenai, anthropic, bedrock)
+            - model: Model name
+            - api_key: API key for the provider
+            - temperature: Temperature for generation (default: 0.1)
+            - max_tokens: Maximum tokens to generate (default: 4096)
+            - Other provider-specific parameters
+
+    Returns:
+        ChatCompletionClient instance
+
+    Raises:
+        Exception: If client initialization fails
+    """
+    try:
+        adapter_id = llm_config.get("adapter_id")
+
+        if adapter_id == "azureopenai":
+            return AzureOpenAIChatCompletionClient(
+                model=llm_config.get("model"),
+                azure_endpoint=llm_config.get("api_base"),
+                temperature=llm_config.get("temperature", 0.1),
+                max_tokens=llm_config.get("max_tokens", 4096),
+                api_version=llm_config.get("api_version"),
+                api_key=llm_config.get("api_key"),
+                azure_deployment=llm_config.get("deployment"),
+                timeout=llm_config.get("timeout", 900),
+            )
+
+        elif adapter_id == "openai":
+            return OpenAIChatCompletionClient(
+                model=llm_config.get("model"),
+                api_key=llm_config.get("api_key"),
+                temperature=llm_config.get("temperature", 0.1),
+                max_tokens=llm_config.get("max_tokens", 4096),
+                request_timeout=llm_config.get("request_timeout", 60),
+                base_url=llm_config.get("api_base"),
+                max_retries=llm_config.get("max_retries", 3),
+                timeout=llm_config.get("timeout", 900),
+            )
+
+        elif adapter_id == "anthropic":
+            return AnthropicAdapter(
+                api_key=llm_config.get("api_key"),
+                model=llm_config.get("model"),
+                temperature=llm_config.get("temperature", 0.1),
+                max_tokens=llm_config.get("max_tokens", 4096),
+            )
+
+        elif adapter_id == "bedrock":
+            return BedrockAdapter(
+                aws_access_key_id=llm_config.get("aws_access_key_id"),
+                aws_secret_access_key=llm_config.get("aws_secret_access_key"),
+                region_name=llm_config.get("region_name"),
+                model=llm_config.get("model"),
+                temperature=llm_config.get("temperature", 0.1),
+                max_tokens=llm_config.get("max_tokens", 4096),
+            )
+
+        else:
+            raise ValueError(
+                f"Unknown adapter_id: {adapter_id}. "
+                f"Supported: openai, azureopenai, anthropic, bedrock"
+            )
+
+    except Exception as e:
+        error_msg = f"Failed to initialize LLM client: {str(e)}"
+        logger.error(error_msg)
+        raise Exception(error_msg) from e
+
+
+async def generate_with_llm(
+    llm_client: ChatCompletionClient, prompt: str, max_tokens: int = 2000
+) -> str:
+    """Generate a response using autogen's completion interface.
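+
+    Illustrative usage (a sketch; assumes a valid ``llm_config`` dict):
+
+        client = get_llm_client(llm_config)
+        text = await generate_with_llm(client, "Describe an invoice.", max_tokens=500)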
+ + Args: + llm_client: ChatCompletionClient instance (from get_llm_client) + prompt: The prompt to send to the LLM + max_tokens: Maximum tokens to generate + + Returns: + Generated text response + + Raises: + Exception: If generation fails + """ + try: + # Create messages in autogen format + messages = [ + SystemMessage(content="You are a helpful assistant that generates document extraction metadata and prompts."), + UserMessage(content=prompt, source="user"), + ] + + # Use autogen's completion API + response = await llm_client.create( + messages=messages, + max_tokens=max_tokens, + ) + + # Extract text from response + if hasattr(response, 'content'): + return response.content.strip() + elif hasattr(response, 'choices') and len(response.choices) > 0: + return response.choices[0].message.content.strip() + else: + raise ValueError("Unexpected response format from LLM") + + except Exception as e: + error_msg = f"Failed to generate with LLM: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) from e + + +async def guess_document_type_with_llm( + file_content: str, + llm_config: Dict[str, Any], +) -> Dict[str, Any]: + """Guess document type from file content using LLM. + + Args: + file_content: Extracted text content from the document + llm_config: LLM configuration dictionary + + Returns: + Dictionary containing: + - status: "success" or "error" + - document_type: Guessed document type (if success) + - confidence: Confidence description (if applicable) + - primary_indicators: List of indicators found + - document_category: Document category + - alternative_types: List of alternative types + - reasoning: Reasoning for the identification + - error: Error message (if error) + """ + try: + from .constants import VibeExtractorBootstrapPrompts + import json + import re + from json_repair import repair_json + + # Truncate content if too long (keep first 4000 characters) + content_sample = ( + file_content[:4000] if len(file_content) > 4000 else file_content + ) + + # Create the full prompt using the constant + full_prompt = f"""{VibeExtractorBootstrapPrompts.DOCUMENT_TYPE_IDENTIFICATION} + +## Document Content to Analyze + +``` +{content_sample} +``` + +Analyze the document content above and respond with your identification in the exact JSON format specified.""" + + # Get LLM client + llm_client = get_llm_client(llm_config) + + # Generate response with higher token limit for detailed analysis + response_text = await generate_with_llm( + llm_client=llm_client, + prompt=full_prompt, + max_tokens=1000, + ) + + # Try to extract JSON from response (in case LLM added markdown) + json_match = re.search( + r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL + ) + if json_match: + json_str = json_match.group(1) + else: + # Try to find JSON object directly + json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_match: + json_str = json_match.group(0) + else: + # No JSON found in response + logger.error( + f"No JSON object found in LLM response: {response_text}" + ) + return { + "status": "error", + "error": "LLM did not return a valid JSON response. " + "Please try again or check the LLM configuration.", + "raw_response": response_text[:500], + } + + # Try to parse JSON + response_json = None + try: + response_json = json.loads(json_str) + except json.JSONDecodeError as json_error: + # Try to repair the JSON + logger.warning( + f"Initial JSON parsing failed: {json_error}. " + f"Attempting to repair JSON..." 
+            )
+            try:
+                repaired_json_str = repair_json(json_str)
+                response_json = json.loads(repaired_json_str)
+                logger.info("Successfully repaired and parsed JSON response")
+            except Exception as repair_error:
+                # JSON repair also failed
+                logger.error(
+                    f"Failed to repair JSON. "
+                    f"Original error: {json_error}. "
+                    f"Repair error: {repair_error}. "
+                    f"Attempted to parse: {json_str[:200]}"
+                )
+                return {
+                    "status": "error",
+                    "error": (
+                        f"Failed to parse LLM response as JSON. "
+                        f"Original error: {str(json_error)}. "
+                        f"JSON repair also failed: {str(repair_error)}"
+                    ),
+                    "raw_response": response_text[:500],
+                    "attempted_json": json_str[:200],
+                }
+
+        # Validate required fields
+        required_fields = ["document_type", "confidence", "reasoning"]
+        missing_fields = [
+            field for field in required_fields if field not in response_json
+        ]
+
+        if missing_fields:
+            logger.warning(
+                f"LLM response missing required fields: {missing_fields}. "
+                f"Response: {response_json}"
+            )
+            return {
+                "status": "error",
+                "error": (
+                    f"LLM response missing required fields: "
+                    f"{', '.join(missing_fields)}"
+                ),
+                "partial_response": response_json,
+            }
+
+        # Successfully parsed and validated
+        return {
+            "status": "success",
+            "document_type": response_json.get("document_type", "unknown"),
+            "confidence": response_json.get("confidence", "unknown"),
+            "primary_indicators": response_json.get("primary_indicators", []),
+            "document_category": response_json.get(
+                "document_category", "unknown"
+            ),
+            "alternative_types": response_json.get("alternative_types", []),
+            "reasoning": response_json.get("reasoning", ""),
+        }
+
+    except Exception as e:
+        error_msg = f"Failed to guess document type with LLM: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+        return {
+            "status": "error",
+            "error": error_msg,
+        }
diff --git a/prompt-service/src/unstract/prompt_service/services/vibe_extractor/service.py b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/service.py
new file mode 100644
index 0000000000..9a823c74c1
--- /dev/null
+++ b/prompt-service/src/unstract/prompt_service/services/vibe_extractor/service.py
@@ -0,0 +1,238 @@
+"""Vibe Extractor Service.
+
+This module provides the main service interface for generating
+document extraction components. It orchestrates the complete
+generation flow.
+"""
+
+import logging
+from pathlib import Path
+from typing import Any, Callable
+
+from .generator import VibeExtractorGenerator
+
+logger = logging.getLogger(__name__)
+
+
+class VibeExtractorService:
+    """Service for generating document extraction components."""
+
+    def __init__(self, llm_config: dict[str, Any], output_dir: str):
+        """Initialize the service.
+
+        Args:
+            llm_config: LLM configuration dictionary
+            output_dir: Base output directory for generated files
+        """
+        self.generator = VibeExtractorGenerator(llm_config)
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    async def generate_all(
+        self,
+        doc_type: str,
+        reference_template: str,
+        progress_callback: Callable[..., Any] | None = None,
+    ) -> dict[str, Any]:
+        """Generate all components for a document type.
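+
+        Illustrative usage (a sketch; the output directory and config are
+        assumptions):
+
+            service = VibeExtractorService(llm_config, "generated")
+            result = await service.generate_all("invoice", template_str)
+            if result["status"] == "success":
+                print(result["files"]["metadata"])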
+
+        Args:
+            doc_type: Document type name (e.g., "invoice", "receipt")
+            reference_template: Reference metadata.yaml template content
+            progress_callback: Optional callback to report progress
+
+        Returns:
+            Dictionary containing:
+                - status: "success" or "error"
+                - output_path: Path to generated files
+                - files: Dictionary of generated file paths
+                - error: Error message if status is "error"
+        """
+        try:
+            logger.info(f"Starting generation for document type: {doc_type}")
+
+            # Create output directory for this document type
+            doc_output_dir = self.output_dir / doc_type
+            doc_output_dir.mkdir(parents=True, exist_ok=True)
+
+            # Create prompts subdirectory
+            prompts_dir = doc_output_dir / "prompts"
+            prompts_dir.mkdir(parents=True, exist_ok=True)
+
+            result = {
+                "status": "success",
+                "output_path": str(doc_output_dir),
+                "files": {},
+            }
+
+            # Step 1: Generate metadata
+            if progress_callback:
+                progress_callback("generating_metadata", "in_progress")
+
+            logger.info("Step 1/5: Generating metadata...")
+            metadata = await self.generator.generate_metadata(
+                doc_type, reference_template
+            )
+            metadata_file = self.generator.save_metadata_yaml(doc_output_dir, metadata)
+            result["files"]["metadata"] = str(metadata_file)
+
+            if progress_callback:
+                progress_callback("generating_metadata", "completed")
+
+            # Step 2: Generate extraction fields
+            if progress_callback:
+                progress_callback("generating_extraction_fields", "in_progress")
+
+            logger.info("Step 2/5: Generating extraction fields...")
+            extraction_yaml = await self.generator.generate_extraction_fields(
+                doc_type, metadata
+            )
+            extraction_file = self.generator.save_extraction_yaml(
+                doc_output_dir, extraction_yaml
+            )
+            result["files"]["extraction"] = str(extraction_file)
+
+            if progress_callback:
+                progress_callback("generating_extraction_fields", "completed")
+
+            # Step 3: Generate page extraction prompts
+            if progress_callback:
+                progress_callback("generating_page_prompts", "in_progress")
+
+            logger.info("Step 3/5: Generating page extraction prompts...")
+            page_system_prompt = (
+                await self.generator.generate_page_extraction_system_prompt(
+                    doc_type, metadata
+                )
+            )
+            page_system_file = self.generator.save_prompt_file(
+                prompts_dir, "page-extraction-system.md", page_system_prompt
+            )
+            result["files"]["page_extraction_system"] = str(page_system_file)
+
+            page_user_prompt = await self.generator.generate_page_extraction_user_prompt(
+                doc_type, metadata
+            )
+            page_user_file = self.generator.save_prompt_file(
+                prompts_dir, "page-extraction-user.md", page_user_prompt
+            )
+            result["files"]["page_extraction_user"] = str(page_user_file)
+
+            if progress_callback:
+                progress_callback("generating_page_prompts", "completed")
+
+            # Step 4: Generate scalar extraction prompts
+            if progress_callback:
+                progress_callback("generating_scalar_prompts", "in_progress")
+
+            logger.info("Step 4/5: Generating scalar extraction prompts...")
+            scalar_system_prompt = (
+                await self.generator.generate_scalar_extraction_system_prompt(
+                    doc_type, metadata, extraction_yaml
+                )
+            )
+            scalar_system_file = self.generator.save_prompt_file(
+                prompts_dir, "extraction-scalars-system.md", scalar_system_prompt
+            )
+            result["files"]["scalars_extraction_system"] = str(scalar_system_file)
+
+            scalar_user_prompt = (
+                await self.generator.generate_scalar_extraction_user_prompt(
+                    doc_type, metadata
+                )
+            )
+            scalar_user_file = self.generator.save_prompt_file(
+                prompts_dir, "extraction-scalars-user.md", scalar_user_prompt
+            )
+            result["files"]["scalars_extraction_user"] = str(scalar_user_file)
+
+            if progress_callback:
+                progress_callback("generating_scalar_prompts", "completed")
+
+            # Step 5: Generate table extraction prompts
+            if progress_callback:
+                progress_callback("generating_table_prompts", "in_progress")
+
+            logger.info("Step 5/5: Generating table extraction prompts...")
+            table_system_prompt = (
+                await self.generator.generate_table_extraction_system_prompt(
+                    doc_type, metadata, extraction_yaml
+                )
+            )
+            table_system_file = self.generator.save_prompt_file(
+                prompts_dir, "extraction-table-system.md", table_system_prompt
+            )
+            result["files"]["tables_extraction_system"] = str(table_system_file)
+
+            table_user_prompt = (
+                await self.generator.generate_table_extraction_user_prompt(
+                    doc_type, metadata
+                )
+            )
+            table_user_file = self.generator.save_prompt_file(
+                prompts_dir, "extraction-table-user.md", table_user_prompt
+            )
+            result["files"]["tables_extraction_user"] = str(table_user_file)
+
+            if progress_callback:
+                progress_callback("generating_table_prompts", "completed")
+
+            logger.info(
+                f"Successfully generated all components for '{doc_type}' at {doc_output_dir}"
+            )
+            return result
+
+        except Exception as e:
+            error_msg = f"Error during generation: {str(e)}"
+            logger.error(error_msg, exc_info=True)
+
+            if progress_callback:
+                progress_callback("error", "failed", error_msg)
+
+            return {"status": "error", "error": error_msg}
+
+    async def generate_metadata_only(
+        self, doc_type: str, reference_template: str
+    ) -> dict[str, Any]:
+        """Generate only metadata for a document type.
+
+        Args:
+            doc_type: Document type name
+            reference_template: Reference metadata.yaml template
+
+        Returns:
+            Dictionary containing generated metadata or error
+        """
+        try:
+            logger.info(f"Generating metadata for: {doc_type}")
+            metadata = await self.generator.generate_metadata(
+                doc_type, reference_template
+            )
+            return {"status": "success", "metadata": metadata}
+        except Exception as e:
+            error_msg = f"Error generating metadata: {str(e)}"
+            logger.error(error_msg)
+            return {"status": "error", "error": error_msg}
+
+    async def generate_extraction_fields_only(
+        self, doc_type: str, metadata: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Generate only extraction fields for a document type.
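+
+        Illustrative usage (a sketch; ``metadata`` as returned by
+        ``generate_metadata_only``):
+
+            fields = await service.generate_extraction_fields_only(
+                "invoice", metadata
+            )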
+ + Args: + doc_type: Document type name + metadata: Metadata dictionary + + Returns: + Dictionary containing extraction YAML or error + """ + try: + logger.info(f"Generating extraction fields for: {doc_type}") + extraction_yaml = await self.generator.generate_extraction_fields( + doc_type, metadata + ) + return {"status": "success", "extraction_yaml": extraction_yaml} + except Exception as e: + error_msg = f"Error generating extraction fields: {str(e)}" + logger.error(error_msg) + return {"status": "error", "error": error_msg} diff --git a/prompt-service/uv.lock b/prompt-service/uv.lock index 542e3db393..ad109c594b 100644 --- a/prompt-service/uv.lock +++ b/prompt-service/uv.lock @@ -238,6 +238,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/aa/91355b5f539caf1b94f0e66ff1e4ee39373b757fce08204981f7829ede51/authlib-1.6.4-py2.py3-none-any.whl", hash = "sha256:39313d2a2caac3ecf6d8f95fbebdfd30ae6ea6ae6a6db794d976405fdd9aa796", size = 243076 }, ] +[[package]] +name = "autogen-agentchat" +version = "0.4.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "autogen-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/50/065e357b08e4594ec949343e8e5b74ecca557e480c3072d4555569f3b517/autogen_agentchat-0.4.4.tar.gz", hash = "sha256:bb4a636707a5fd91950685b68f28019bdc9f64a101cd87109715dd212d295106", size = 58173 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/fd/7fade5f943a8c5094130f9aefd72a18818d1862d85078ec0d9ac62b0b51a/autogen_agentchat-0.4.4-py3-none-any.whl", hash = "sha256:c10e6e5a867403b8cc37c9f733e6f8a9b4f32e399808bdb1720bc00274b5e516", size = 63016 }, +] + +[[package]] +name = "autogen-core" +version = "0.4.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jsonref" }, + { name = "opentelemetry-api" }, + { name = "pillow" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/f5/de81486709bf04d89e4824a4b755e66c53d6d572e802514731f23314f319/autogen_core-0.4.4.tar.gz", hash = "sha256:053a17bbf7fb345bbe1249f0f7181f6a6b15f2dfa47b8ce69910b8001ff96156", size = 2310980 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/2f/f8dcd48ccb99f92252987d20c86e51e58eaed6bcfb1cf727b9bdfa51b1bc/autogen_core-0.4.4-py3-none-any.whl", hash = "sha256:2e891b20817b90e847c0f580d9ccd1cb1f682bdb2830d3b7d9425bd6966f6d28", size = 78245 }, +] + +[[package]] +name = "autogen-ext" +version = "0.4.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "autogen-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/4a/9ff4b65c773cc086f2b81cbff44161ff7f976fa235a2ec63470869cd1526/autogen_ext-0.4.4.tar.gz", hash = "sha256:116ece3a75af48f194da3a482b7f546e252d398dfaa133c62145d24a88fb8ac0", size = 140011 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/b4/cda27ed34a8be22d9c082f1b1b41ca5f4695b6b6462419ffffe50b23f086/autogen_ext-0.4.4-py3-none-any.whl", hash = "sha256:f994006ee34473524cd3ff9017c558d55a6610013d1e5d3a8f7ac90e15fb12df", size = 143012 }, +] + [[package]] name = "azure-core" version = "1.35.1" @@ -1328,6 +1369,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/c7/dd23f764de95771300a8a7ae17293c47ba0e48f826154229d18ecfe147cd/json_repair-0.42.0-py3-none-any.whl", hash = "sha256:7b6805162053dfe65722e961bc51b5eecec0582ec8a8e0fd218d33e8de757daf", size = 21612 }, ] +[[package]] +name = "jsonref" +version = "1.1.0" +source 
= { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425 }, +] + [[package]] name = "jsonschema" version = "4.25.1" @@ -3152,6 +3202,9 @@ name = "unstract-prompt-service" version = "0.0.1" source = { editable = "." } dependencies = [ + { name = "autogen-agentchat" }, + { name = "autogen-core" }, + { name = "autogen-ext" }, { name = "flask" }, { name = "json-repair" }, { name = "llama-index" }, @@ -3186,6 +3239,9 @@ test = [ [package.metadata] requires-dist = [ + { name = "autogen-agentchat", specifier = ">=0.4.0" }, + { name = "autogen-core", specifier = ">=0.4.0" }, + { name = "autogen-ext", specifier = ">=0.4.0" }, { name = "flask", specifier = "~=3.0" }, { name = "json-repair", specifier = "~=0.42.0" }, { name = "llama-index", specifier = "==0.13.2" }, diff --git a/unstract/sdk1/src/unstract/sdk1/prompt.py b/unstract/sdk1/src/unstract/sdk1/prompt.py index 85e87a527e..67a2e192f1 100644 --- a/unstract/sdk1/src/unstract/sdk1/prompt.py +++ b/unstract/sdk1/src/unstract/sdk1/prompt.py @@ -171,6 +171,158 @@ def summarize( headers=headers, ) + @log_elapsed(operation="VIBE_EXTRACTOR_GUESS_DOCUMENT_TYPE") + @handle_service_exceptions("guessing document type") + def guess_document_type( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Guess document type from file content using LLM. + + Args: + payload: Dictionary with file_content and llm_config + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with document_type, confidence, and metadata + """ + return self._call_service( + url_path="vibe-extractor/guess-document-type", + payload=payload, + params=params, + headers=headers, + ) + + @log_elapsed(operation="VIBE_EXTRACTOR_GENERATE_METADATA") + @handle_service_exceptions("generating metadata") + def generate_metadata( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Generate metadata for a document type. + + Args: + payload: Dictionary with doc_type, llm_config, reference_template + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with generated metadata + """ + return self._call_service( + url_path="vibe-extractor/generate-metadata", + payload=payload, + params=params, + headers=headers, + ) + + @log_elapsed(operation="VIBE_EXTRACTOR_GENERATE_EXTRACTION_FIELDS") + @handle_service_exceptions("generating extraction fields") + def generate_extraction_fields( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Generate extraction fields YAML for a document type. 
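+
+        Illustrative payload (a sketch; keys follow the Args below, values
+        are assumptions):
+
+            payload = {
+                "doc_type": "invoice",
+                "metadata_description": "Commercial invoice issued to ...",
+                "llm_config": {"adapter_id": "openai", "model": "gpt-4o"},
+            }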
+ + Args: + payload: Dictionary with doc_type, metadata_description, llm_config + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with extraction_yaml string + """ + return self._call_service( + url_path="vibe-extractor/generate-extraction-fields", + payload=payload, + params=params, + headers=headers, + ) + + @log_elapsed(operation="VIBE_EXTRACTOR_GENERATE_PAGE_PROMPTS") + @handle_service_exceptions("generating page prompts") + def generate_page_prompts( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Generate page extraction prompts for a document type. + + Args: + payload: Dictionary with doc_type, metadata_description, llm_config + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with system_prompt and user_prompt + """ + return self._call_service( + url_path="vibe-extractor/generate-page-prompts", + payload=payload, + params=params, + headers=headers, + ) + + @log_elapsed(operation="VIBE_EXTRACTOR_GENERATE_SCALAR_PROMPTS") + @handle_service_exceptions("generating scalar prompts") + def generate_scalar_prompts( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Generate scalar extraction prompts for a document type. + + Args: + payload: Dictionary with doc_type, metadata_description, + extraction_yaml, scalar_fields, llm_config + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with system_prompt and user_prompt + """ + return self._call_service( + url_path="vibe-extractor/generate-scalar-prompts", + payload=payload, + params=params, + headers=headers, + ) + + @log_elapsed(operation="VIBE_EXTRACTOR_GENERATE_TABLE_PROMPTS") + @handle_service_exceptions("generating table prompts") + def generate_table_prompts( + self, + payload: dict[str, Any], + params: dict[str, str] | None = None, + headers: dict[str, str] | None = None, + ) -> dict[str, Any]: + """Generate table extraction prompts for a document type. + + Args: + payload: Dictionary with doc_type, metadata_description, + extraction_yaml, list_fields, llm_config + params: Optional query parameters + headers: Optional request headers + + Returns: + dict: Response with system_prompt and user_prompt + """ + return self._call_service( + url_path="vibe-extractor/generate-table-prompts", + payload=payload, + params=params, + headers=headers, + ) + def _get_headers(self, headers: dict[str, str] | None = None) -> dict[str, str]: """Get default headers for requests.