-# Example Configuration for MCP-Based Category Classifier
+# Example Configuration for MCP-Based Category Classifier (HTTP Transport)
 #
 # This configuration demonstrates how to use an external MCP (Model Context Protocol)
-# service for category classification instead of the built-in Candle/ModernBERT models.
+# service via HTTP for category classification instead of the built-in Candle/ModernBERT models.
 #
 # Use cases:
-# - Offload classification to a remote service
+# - Offload classification to a remote HTTP service
 # - Use custom classification models not supported in-tree
 # - Scale classification independently from the router
-# - Integrate with existing ML infrastructure
+# - Integrate with existing ML infrastructure via REST API
+#
+# Note: This example uses HTTP transport. The MCP server should expose an HTTP
+# endpoint that implements the MCP protocol (e.g., http://localhost:8090/mcp).
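+#
+# For reference, with HTTP transport the router speaks standard MCP JSON-RPC.
+# An illustrative 'classify_text' invocation (the "text" argument name is an
+# assumption; the real schema is whatever your MCP server defines):
+#
+#   POST http://localhost:8090/mcp
+#   {
+#     "jsonrpc": "2.0",
+#     "id": 1,
+#     "method": "tools/call",
+#     "params": {
+#       "name": "classify_text",
+#       "arguments": {"text": "What is the integral of x^2?"}
+#     }
+#   }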

 # BERT model for semantic caching and tool selection
 bert_model:
-  model_id: "models/all-MiniLM-L6-v2"
+  model_id: "sentence-transformers/all-MiniLM-L6-v2"
   threshold: 0.85
   use_cpu: true

@@ -20,156 +23,85 @@ classifier:
   # Disable in-tree category classifier (leave model_id empty)
   category_model:
     model_id: ""  # Empty = disabled
-    threshold: 0.6
-    use_cpu: true
-    use_modernbert: false
-    category_mapping_path: ""

-  # Enable MCP-based category classifier
+  # Enable MCP-based category classifier (HTTP transport only)
   mcp_category_model:
     enabled: true  # Enable MCP classifier
-    transport_type: "stdio"  # "stdio" or "http"
-
-    # For stdio transport: run a local Python MCP server
-    command: "python"
-    args: ["-m", "mcp_category_classifier"]
-    env:
-      PYTHONPATH: "/opt/ml/models"
-      MODEL_PATH: "/opt/ml/models/category_classifier"
-      LOG_LEVEL: "INFO"
+    transport_type: "http"  # HTTP transport
+    url: "http://localhost:8090/mcp"  # MCP server endpoint

-    # For http transport: use this instead
-    # transport_type: "http"
-    # url: "http://localhost:8080/mcp"
-
-    tool_name: "classify_text"  # MCP tool name to call
+    tool_name: "classify_text"  # MCP tool name to call
     threshold: 0.6  # Confidence threshold
     timeout_seconds: 30  # Request timeout
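+    # Note: results scoring below 'threshold' are presumably treated as
+    # undetermined, so the router falls back to default_model (see FALLBACK below)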

-  # PII model configuration (unchanged)
-  pii_model:
-    model_id: "models/pii_classifier"
-    threshold: 0.7
-    use_cpu: true
-    pii_mapping_path: "models/pii_classifier/pii_type_mapping.json"
-
-  # Prompt guard configuration (unchanged)
-  prompt_guard:
-    enabled: true
-    model_id: "models/jailbreak_classifier"
-    threshold: 0.8
-    use_cpu: true
-    use_modernbert: true
-    jailbreak_mapping_path: "models/jailbreak_classifier/jailbreak_mapping.json"
-
 # Categories for routing queries
-categories:
-  - name: "math"
-    description: "Mathematical problems, equations, calculus, algebra, statistics"
-    model_scores:
-      - model: "deepseek/deepseek-r1:70b"
-        score: 0.95
-        use_reasoning: true
-      - model: "qwen/qwen3-235b"
-        score: 0.90
-        use_reasoning: true
-    mmlu_categories:
-      - "mathematics"
-      - "statistics"
-
-  - name: "coding"
-    description: "Programming, software development, debugging, algorithms"
-    model_scores:
-      - model: "deepseek/deepseek-r1-coder:33b"
-        score: 0.95
-        use_reasoning: true
-      - model: "meta/llama3.1-70b"
-        score: 0.85
-        use_reasoning: false
-    mmlu_categories:
-      - "computer_science"
-      - "engineering"
-
-  - name: "general"
-    description: "General knowledge, conversation, misc queries"
-    model_scores:
-      - model: "meta/llama3.1-70b"
-        score: 0.90
-        use_reasoning: false
-      - model: "qwen/qwen3-235b"
-        score: 0.85
-        use_reasoning: false
+#
+# Categories are loaded automatically from the MCP server via the 'list_categories' tool.
+# The MCP server controls BOTH classification AND routing decisions.
+#
+# How it works:
+# 1. The router connects to the MCP server at startup
+# 2. It calls the 'list_categories' tool: the server returns {"categories": ["business", "law", ...]}
+# 3. For each request, it calls the 'classify_text' tool, which returns:
+#    {
+#      "class": 3,
+#      "confidence": 0.85,
+#      "model": "openai/gpt-oss-20b",  # MCP decides which model to use
+#      "use_reasoning": true           # MCP decides whether to use reasoning
+#    }
+# 4. The router uses the model and reasoning settings from the MCP response
+#
+# BENEFITS:
+# - The MCP server makes per-query routing decisions
+# - No hardcoded routing rules needed in this config
+# - The MCP server can adapt routing to query complexity, content, etc.
+# - Routing logic is centralized in the MCP server
+#
+# FALLBACK:
+# - If the MCP server doesn't return model/use_reasoning, the router uses default_model below
+# - Category-specific overrides can also be added here if needed (see the commented example below)
+#
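+# Example of such an override (hypothetical; it mirrors the schema of the static
+# categories removed above):
+#
+# categories:
+#   - name: "math"
+#     description: "Mathematical problems, equations, calculus, algebra, statistics"
+#     model_scores:
+#       - model: "openai/gpt-oss-20b"
+#         score: 0.95
+#         use_reasoning: true
+#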
+categories: []

 # Default model to use when category can't be determined
-default_model: "meta/llama3.1-70b"
+default_model: openai/gpt-oss-20b

 # vLLM endpoints configuration
 vllm_endpoints:
-  - name: "deepseek-endpoint"
-    address: "10.0.1.10"
+  - name: endpoint1
+    address: 127.0.0.1
     port: 8000
     models:
-      - "deepseek/deepseek-r1:70b"
-      - "deepseek/deepseek-r1-coder:33b"
-    weight: 100
-
-  - name: "qwen-endpoint"
-    address: "10.0.1.11"
-    port: 8000
-    models:
-      - "qwen/qwen3-235b"
-    weight: 100
-
-  - name: "llama-endpoint"
-    address: "10.0.1.12"
-    port: 8000
-    models:
-      - "meta/llama3.1-70b"
-    weight: 100
-
-# Semantic cache configuration (optional)
-semantic_cache:
-  enabled: true
-  backend_type: "in-memory"
-  similarity_threshold: 0.90
-  max_entries: 1000
-  ttl_seconds: 3600
-  eviction_policy: "lru"
+      - openai/gpt-oss-20b
+    weight: 1
+    health_check_path: /health
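+  # More endpoints can be declared the same way, with traffic split by 'weight'.
+  # Hypothetical second endpoint for illustration:
+  #   - name: endpoint2
+  #     address: 127.0.0.2
+  #     port: 8000
+  #     models:
+  #       - openai/gpt-oss-20b
+  #     weight: 1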

 # Model-specific configuration
 model_config:
-  "deepseek/deepseek-r1:70b":
-    reasoning_family: "deepseek"
-    pii_policy:
-      allow_by_default: false
-      pii_types_allowed: []
-
-  "deepseek/deepseek-r1-coder:33b":
-    reasoning_family: "deepseek"
-    pii_policy:
-      allow_by_default: false
-      pii_types_allowed: []
-
-  "qwen/qwen3-235b":
-    reasoning_family: "qwen3"
-    pii_policy:
-      allow_by_default: true
-
-  "meta/llama3.1-70b":
+  openai/gpt-oss-20b:
+    reasoning_family: gpt-oss
+    preferred_endpoints:
+      - endpoint1
     pii_policy:
       allow_by_default: true

 # Reasoning family configurations
 reasoning_families:
   deepseek:
-    type: "chat_template_kwargs"
-    parameter: "thinking"
+    type: chat_template_kwargs
+    parameter: thinking
   qwen3:
-    type: "reasoning_effort"
-    parameter: "reasoning_effort"
+    type: chat_template_kwargs
+    parameter: enable_thinking
   gpt-oss:
-    type: "chat_template_kwargs"
-    parameter: "enable_thinking"
+    type: reasoning_effort
+    parameter: reasoning_effort
+  gpt:
+    type: reasoning_effort
+    parameter: reasoning_effort
+
+# Default reasoning effort level
+default_reasoning_effort: high
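+#
+# A 'chat_template_kwargs' family presumably toggles thinking through the chat
+# template, e.g. {"chat_template_kwargs": {"thinking": true}} for deepseek, while
+# a 'reasoning_effort' family sets the OpenAI-style {"reasoning_effort": "high"}
+# field using default_reasoning_effort above.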

 # Tools configuration (optional)
 tools:
@@ -182,9 +114,37 @@ tools:
 # API configuration
 api:
   batch_classification:
+    max_batch_size: 100
+    concurrency_threshold: 5
+    max_concurrency: 8
     metrics:
       enabled: true
+      detailed_goroutine_tracking: true
+      high_resolution_timing: false
       sample_rate: 1.0
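+      # Prometheus-style histogram buckets (durations presumably in seconds,
+      # sizes in texts per batch):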
+      duration_buckets:
+        - 0.001
+        - 0.005
+        - 0.01
+        - 0.025
+        - 0.05
+        - 0.1
+        - 0.25
+        - 0.5
+        - 1
+        - 2.5
+        - 5
+        - 10
+        - 30
+      size_buckets:
+        - 1
+        - 2
+        - 5
+        - 10
+        - 20
+        - 50
+        - 100
+        - 200

 # Observability configuration
 observability: