Skip to content

Commit de3c8c8

Browse files
authored
[InferenceSnippets] Document model:provider syntax (#1636)
This PR updates the inference snippets to showcase the `https://router.huggingface.co/v1` "auto" route + the new syntax to select a model+provider e.g. `model="meta-llama/Llama-3.1-8B-Instruct:together"`. Many details to take into account but I reviewed all the examples one by one and I think they're good now. Once merged, we won't need any modification in moon-landing code.
1 parent 18b62dc commit de3c8c8

File tree

45 files changed

+116
-87
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+116
-87
lines changed

packages/inference/src/snippets/getInferenceSnippets.ts

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ interface TemplateParams {
4949
fullUrl?: string;
5050
inputs?: object;
5151
providerInputs?: object;
52+
autoInputs?: object;
5253
model?: ModelDataMinimal;
5354
provider?: InferenceProviderOrPolicy;
5455
providerModelId?: string;
@@ -202,12 +203,28 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
202203
}
203204
}
204205

206+
// Inputs for the "auto" route is strictly the same as "inputs", except the model includes the provider
207+
// If not "auto" route, use the providerInputs
208+
const autoInputs =
209+
provider !== "auto" && !opts?.endpointUrl && !opts?.directRequest
210+
? {
211+
...inputs,
212+
model: `${model.id}:${provider}`,
213+
}
214+
: providerInputs;
215+
205216
/// Prepare template injection data
206217
const params: TemplateParams = {
207218
accessToken: accessTokenOrPlaceholder,
208219
authorizationHeader: (request.info.headers as Record<string, string>)?.Authorization,
209-
baseUrl: removeSuffix(request.url, "/chat/completions"),
210-
fullUrl: request.url,
220+
baseUrl:
221+
task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
222+
? HF_ROUTER_AUTO_ENDPOINT
223+
: removeSuffix(request.url, "/chat/completions"),
224+
fullUrl:
225+
task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
226+
? HF_ROUTER_AUTO_ENDPOINT + "/chat/completions"
227+
: request.url,
211228
inputs: {
212229
asObj: inputs,
213230
asCurlString: formatBody(inputs, "curl"),
@@ -222,9 +239,21 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
222239
asPythonString: formatBody(providerInputs, "python"),
223240
asTsString: formatBody(providerInputs, "ts"),
224241
},
242+
autoInputs: {
243+
asObj: autoInputs,
244+
asCurlString: formatBody(autoInputs, "curl"),
245+
asJsonString: formatBody(autoInputs, "json"),
246+
asPythonString: formatBody(autoInputs, "python"),
247+
asTsString: formatBody(autoInputs, "ts"),
248+
},
225249
model,
226250
provider,
227-
providerModelId: providerModelId ?? model.id,
251+
providerModelId:
252+
task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
253+
? provider !== "auto"
254+
? `${model.id}:${provider}` // e.g. "moonshotai/Kimi-K2-Instruct:groq"
255+
: model.id
256+
: providerModelId ?? model.id,
228257
billTo: opts?.billTo,
229258
endpointUrl: opts?.endpointUrl,
230259
};

packages/inference/src/snippets/templates/python/requests/conversational.jinja

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ def query(payload):
33
return response.json()
44

55
response = query({
6-
{{ providerInputs.asJsonString }}
6+
{{ autoInputs.asJsonString }}
77
})
88

99
print(response["choices"][0]["message"])

packages/inference/src/snippets/templates/python/requests/conversationalStream.jinja

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def query(payload):
88
yield json.loads(line.decode("utf-8").lstrip("data:").rstrip("/n"))
99

1010
chunks = query({
11-
{{ providerInputs.asJsonString }},
11+
{{ autoInputs.asJsonString }},
1212
"stream": True,
1313
})
1414

packages/inference/src/snippets/templates/sh/curl/conversational.jinja

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ curl {{ fullUrl }} \
55
-H 'X-HF-Bill-To: {{ billTo }}' \
66
{% endif %}
77
-d '{
8-
{{ providerInputs.asCurlString }},
8+
{{ autoInputs.asCurlString }},
99
"stream": false
1010
}'

packages/inference/src/snippets/templates/sh/curl/conversationalStream.jinja

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ curl {{ fullUrl }} \
55
-H 'X-HF-Bill-To: {{ billTo }}' \
66
{% endif %}
77
-d '{
8-
{{ providerInputs.asCurlString }},
8+
{{ autoInputs.asCurlString }},
99
"stream": true
1010
}'

packages/tasks-gen/snippets-fixtures/bill-to-param/js/openai/0.hf-inference.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
import { OpenAI } from "openai";
22

33
const client = new OpenAI({
4-
baseURL: "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
4+
baseURL: "https://router.huggingface.co/v1",
55
apiKey: process.env.HF_TOKEN,
66
defaultHeaders: {
77
"X-HF-Bill-To": "huggingface"
88
}
99
});
1010

1111
const chatCompletion = await client.chat.completions.create({
12-
model: "meta-llama/Llama-3.1-8B-Instruct",
12+
model: "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
1313
messages: [
1414
{
1515
role: "user",

packages/tasks-gen/snippets-fixtures/bill-to-param/python/openai/0.hf-inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
from openai import OpenAI
33

44
client = OpenAI(
5-
base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
5+
base_url="https://router.huggingface.co/v1",
66
api_key=os.environ["HF_TOKEN"],
77
default_headers={
88
"X-HF-Bill-To": "huggingface"
99
}
1010
)
1111

1212
completion = client.chat.completions.create(
13-
model="meta-llama/Llama-3.1-8B-Instruct",
13+
model="meta-llama/Llama-3.1-8B-Instruct:hf-inference",
1414
messages=[
1515
{
1616
"role": "user",

packages/tasks-gen/snippets-fixtures/bill-to-param/python/requests/0.hf-inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
import requests
33

4-
API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions"
4+
API_URL = "https://router.huggingface.co/v1/chat/completions"
55
headers = {
66
"Authorization": f"Bearer {os.environ['HF_TOKEN']}",
77
"X-HF-Bill-To": "huggingface"
@@ -18,7 +18,7 @@ def query(payload):
1818
"content": "What is the capital of France?"
1919
}
2020
],
21-
"model": "meta-llama/Llama-3.1-8B-Instruct"
21+
"model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference"
2222
})
2323

2424
print(response["choices"][0]["message"])

packages/tasks-gen/snippets-fixtures/bill-to-param/sh/curl/0.hf-inference.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions \
1+
curl https://router.huggingface.co/v1/chat/completions \
22
-H "Authorization: Bearer $HF_TOKEN" \
33
-H 'Content-Type: application/json' \
44
-H 'X-HF-Bill-To: huggingface' \
@@ -9,6 +9,6 @@ curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-I
99
"content": "What is the capital of France?"
1010
}
1111
],
12-
"model": "meta-llama/Llama-3.1-8B-Instruct",
12+
"model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
1313
"stream": false
1414
}'

packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/js/openai/0.hf-inference.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import { OpenAI } from "openai";
22

33
const client = new OpenAI({
4-
baseURL: "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
4+
baseURL: "https://router.huggingface.co/v1",
55
apiKey: process.env.HF_TOKEN,
66
});
77

88
const chatCompletion = await client.chat.completions.create({
9-
model: "meta-llama/Llama-3.1-8B-Instruct",
9+
model: "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
1010
messages: [
1111
{
1212
role: "user",

0 commit comments

Comments (0)