Skip to content

Commit 6934abd

Browse files
committed
add test and address comments
1 parent f272112 commit 6934abd

9 files changed

+290
-5
lines changed

ddtrace/llmobs/_experiment.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def __init__(
118118

119119

120120
class ExperimentResult(TypedDict):
121+
# TODO: remove these fields (summary_evaluations, rows) in the next major release (5.x)
121122
summary_evaluations: Dict[str, Dict[str, JSONType]]
122123
rows: List[ExperimentRowResult]
123124
runs: List[ExperimentRun]

ddtrace/llmobs/_llmobs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ def _llmobs_tags(span: Span, ml_app: str, session_id: Optional[str] = None) -> L
457457
# set experiment tags on children spans if the tags do not already exist
458458
experiment_id = span.context.get_baggage_item(EXPERIMENT_ID_KEY)
459459
if experiment_id and "experiment_id" not in tags:
460-
tags["experiment_id"] = experiment_id
460+
tags["experiment_id"] = experiment_id
461461

462462
run_id = span.context.get_baggage_item(EXPERIMENT_RUN_ID_KEY)
463463
if run_id and "run_id" not in tags:

releasenotes/notes/llmobs-dne-experiments-multi-run-ef099e98a5827e49.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
features:
33
- |
44
LLM Observability: Experiments can now be run multiple times by using the optional ``runs`` argument,
5-
to assess the true performance of an experiment in the face of the non determinism of LLMs. Use the new ``ExperimentResult`` class' ``runs`` attribute to access the results by run iteration.
5+
to assess the true performance of an experiment in the face of the non-determinism of LLMs. Use the new ``ExperimentResult`` class's ``runs`` attribute to access the results and summary evaluations by run iteration.
6+
- |
7+
LLM Observability: Non-root experiment spans are now tagged with experiment ID, run ID, and run iteration tags.
68
deprecations:
79
- |
810
LLM Observability: The ``ExperimentResult`` class's ``rows`` and ``summary_evaluations`` attributes are deprecated and will be removed in the next major release. For multi-run experiments, the ``ExperimentResult.rows`` and ``ExperimentResult.summary_evaluations`` attributes will only store the results of the first run iteration. Use the ``ExperimentResult.runs`` attribute instead to access experiment results and summary evaluations.
9-
fixes:
10-
- |
11-
LLM Observability: Non-root experiment spans are now tagged with experiment ID, run ID, and run iteration tags.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
4+
"metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
5+
"timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
6+
0, "error": null, "tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114",
7+
"experiment_id:b1d96a7b-aea5-48a6-9bff-44a4d66e5788", "run_id:12345678-abcd-abcd-abcd-123456789012",
8+
"run_iteration:1"], "experiment_id": "b1d96a7b-aea5-48a6-9bff-44a4d66e5788"}],
9+
"tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114", "experiment_id:b1d96a7b-aea5-48a6-9bff-44a4d66e5788",
10+
"run_id:12345678-abcd-abcd-abcd-123456789012", "run_iteration:1"]}}}'
11+
headers:
12+
Accept:
13+
- '*/*'
14+
? !!python/object/apply:multidict._multidict.istr
15+
- Accept-Encoding
16+
: - identity
17+
Connection:
18+
- keep-alive
19+
Content-Length:
20+
- '682'
21+
? !!python/object/apply:multidict._multidict.istr
22+
- Content-Type
23+
: - application/json
24+
User-Agent:
25+
- python-requests/2.32.3
26+
method: POST
27+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/b1d96a7b-aea5-48a6-9bff-44a4d66e5788/events
28+
response:
29+
body:
30+
string: ''
31+
headers:
32+
content-length:
33+
- '0'
34+
content-security-policy:
35+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
36+
content-type:
37+
- application/vnd.api+json
38+
date:
39+
- Mon, 17 Nov 2025 07:47:20 GMT
40+
strict-transport-security:
41+
- max-age=31536000; includeSubDomains; preload
42+
vary:
43+
- Accept-Encoding
44+
x-content-type-options:
45+
- nosniff
46+
x-frame-options:
47+
- SAMEORIGIN
48+
status:
49+
code: 202
50+
message: Accepted
51+
version: 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
4+
"metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
5+
"timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
6+
0, "error": null, "tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114",
7+
"experiment_id:b1d96a7b-aea5-48a6-9bff-44a4d66e5788", "run_id:12345678-abcd-abcd-abcd-123456789012",
8+
"run_iteration:1"], "experiment_id": "b1d96a7b-aea5-48a6-9bff-44a4d66e5788"},
9+
{"metric_source": "summary", "span_id": "", "trace_id": "", "timestamp_ms":
10+
1234, "metric_type": "score", "label": "dummy_summary_evaluator", "score_value":
11+
4, "error": null, "tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114",
12+
"experiment_id:b1d96a7b-aea5-48a6-9bff-44a4d66e5788", "run_id:12345678-abcd-abcd-abcd-123456789012",
13+
"run_iteration:1"], "experiment_id": "b1d96a7b-aea5-48a6-9bff-44a4d66e5788"}],
14+
"tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114", "experiment_id:b1d96a7b-aea5-48a6-9bff-44a4d66e5788",
15+
"run_id:12345678-abcd-abcd-abcd-123456789012", "run_iteration:1"]}}}'
16+
headers:
17+
Accept:
18+
- '*/*'
19+
? !!python/object/apply:multidict._multidict.istr
20+
- Accept-Encoding
21+
: - identity
22+
Connection:
23+
- keep-alive
24+
Content-Length:
25+
- '1098'
26+
? !!python/object/apply:multidict._multidict.istr
27+
- Content-Type
28+
: - application/json
29+
User-Agent:
30+
- python-requests/2.32.3
31+
method: POST
32+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/b1d96a7b-aea5-48a6-9bff-44a4d66e5788/events
33+
response:
34+
body:
35+
string: ''
36+
headers:
37+
content-length:
38+
- '0'
39+
content-security-policy:
40+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
41+
content-type:
42+
- application/vnd.api+json
43+
date:
44+
- Mon, 17 Nov 2025 07:47:22 GMT
45+
strict-transport-security:
46+
- max-age=31536000; includeSubDomains; preload
47+
vary:
48+
- Accept-Encoding
49+
x-content-type-options:
50+
- nosniff
51+
x-frame-options:
52+
- SAMEORIGIN
53+
status:
54+
code: 202
55+
message: Accepted
56+
version: 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
4+
"metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
5+
"timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
6+
0, "error": null, "tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114",
7+
"experiment_id:fab62630-6e2a-4c5f-9e05-26e601f0bc08", "run_id:12345678-abcd-abcd-abcd-123456789012",
8+
"run_iteration:1"], "experiment_id": "fab62630-6e2a-4c5f-9e05-26e601f0bc08"}],
9+
"tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114", "experiment_id:fab62630-6e2a-4c5f-9e05-26e601f0bc08",
10+
"run_id:12345678-abcd-abcd-abcd-123456789012", "run_iteration:1"]}}}'
11+
headers:
12+
Accept:
13+
- '*/*'
14+
? !!python/object/apply:multidict._multidict.istr
15+
- Accept-Encoding
16+
: - identity
17+
Connection:
18+
- keep-alive
19+
Content-Length:
20+
- '682'
21+
? !!python/object/apply:multidict._multidict.istr
22+
- Content-Type
23+
: - application/json
24+
User-Agent:
25+
- python-requests/2.32.3
26+
method: POST
27+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/fab62630-6e2a-4c5f-9e05-26e601f0bc08/events
28+
response:
29+
body:
30+
string: ''
31+
headers:
32+
content-length:
33+
- '0'
34+
content-security-policy:
35+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
36+
content-type:
37+
- application/vnd.api+json
38+
date:
39+
- Mon, 17 Nov 2025 07:47:21 GMT
40+
strict-transport-security:
41+
- max-age=31536000; includeSubDomains; preload
42+
vary:
43+
- Accept-Encoding
44+
x-content-type-options:
45+
- nosniff
46+
x-frame-options:
47+
- SAMEORIGIN
48+
status:
49+
code: 202
50+
message: Accepted
51+
version: 1
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"name": "test_experiment",
4+
"description": "", "dataset_id": "0969efc9-f104-45cc-b955-25b329e91293", "project_id":
5+
"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9", "dataset_version": 1, "config": {},
6+
"metadata": {"tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114"]},
7+
"ensure_unique": true, "run_count": 1}}}'
8+
headers:
9+
Accept:
10+
- '*/*'
11+
? !!python/object/apply:multidict._multidict.istr
12+
- Accept-Encoding
13+
: - identity
14+
Connection:
15+
- keep-alive
16+
Content-Length:
17+
- '355'
18+
? !!python/object/apply:multidict._multidict.istr
19+
- Content-Type
20+
: - application/json
21+
User-Agent:
22+
- python-requests/2.32.3
23+
method: POST
24+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
25+
response:
26+
body:
27+
string: '{"data":{"id":"b1d96a7b-aea5-48a6-9bff-44a4d66e5788","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2025-11-17T07:47:20.335980528Z","dataset_id":"0969efc9-f104-45cc-b955-25b329e91293","dataset_version":1,"description":"","experiment":"test_experiment","metadata":{"tags":["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114"]},"name":"test_experiment-1763365640335","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-11-17T07:47:20.335980602Z"}}}'
28+
headers:
29+
content-length:
30+
- '534'
31+
content-security-policy:
32+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
33+
content-type:
34+
- application/vnd.api+json
35+
date:
36+
- Mon, 17 Nov 2025 07:47:20 GMT
37+
strict-transport-security:
38+
- max-age=31536000; includeSubDomains; preload
39+
vary:
40+
- Accept-Encoding
41+
x-content-type-options:
42+
- nosniff
43+
x-frame-options:
44+
- SAMEORIGIN
45+
status:
46+
code: 200
47+
message: OK
48+
version: 1
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"name": "test_experiment",
4+
"description": "", "dataset_id": "0969efc9-f104-45cc-b955-25b329e91293", "project_id":
5+
"c4b49fb5-7b16-46e1-86f0-de5800e8a56c", "dataset_version": 1, "config": {},
6+
"metadata": {"tags": ["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114"]},
7+
"ensure_unique": true, "run_count": 1}}}'
8+
headers:
9+
Accept:
10+
- '*/*'
11+
? !!python/object/apply:multidict._multidict.istr
12+
- Accept-Encoding
13+
: - identity
14+
Connection:
15+
- keep-alive
16+
Content-Length:
17+
- '355'
18+
? !!python/object/apply:multidict._multidict.istr
19+
- Content-Type
20+
: - application/json
21+
User-Agent:
22+
- python-requests/2.32.3
23+
method: POST
24+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
25+
response:
26+
body:
27+
string: '{"data":{"id":"fab62630-6e2a-4c5f-9e05-26e601f0bc08","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2025-11-17T07:47:21.297487816Z","dataset_id":"0969efc9-f104-45cc-b955-25b329e91293","dataset_version":1,"description":"","experiment":"test_experiment","metadata":{"tags":["ddtrace.version:3.19.0.dev42+g1f1eda22d.d20251114"]},"name":"test_experiment-1763365641297","project_id":"c4b49fb5-7b16-46e1-86f0-de5800e8a56c","updated_at":"2025-11-17T07:47:21.29748789Z"}}}'
28+
headers:
29+
content-length:
30+
- '533'
31+
content-security-policy:
32+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
33+
content-type:
34+
- application/vnd.api+json
35+
date:
36+
- Mon, 17 Nov 2025 07:47:21 GMT
37+
strict-transport-security:
38+
- max-age=31536000; includeSubDomains; preload
39+
vary:
40+
- Accept-Encoding
41+
x-content-type-options:
42+
- nosniff
43+
x-frame-options:
44+
- SAMEORIGIN
45+
status:
46+
code: 200
47+
message: OK
48+
version: 1

tests/llmobs/test_experiments.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
from typing import List
1919
from typing import Optional
2020
from unittest.mock import MagicMock
21+
from uuid import UUID
2122

2223
import mock
2324
import pytest
2425

26+
import ddtrace
2527
from ddtrace.llmobs._experiment import Dataset
2628
from ddtrace.llmobs._experiment import DatasetRecord
2729
from ddtrace.llmobs._experiment import _ExperimentRunInfo
@@ -64,6 +66,9 @@ def dummy_summary_evaluator_using_missing_eval_results(inputs, outputs, expected
6466
return len(inputs) + len(outputs) + len(expected_outputs) + len(evaluators_results["non_existent_evaluator"])
6567

6668

69+
DUMMY_EXPERIMENT_FIRST_RUN_ID = UUID("12345678-abcd-abcd-abcd-123456789012")
70+
71+
6772
def run_info_with_stable_id(iteration: int, run_id: Optional[str] = None) -> _ExperimentRunInfo:
6873
eri = _ExperimentRunInfo(iteration)
6974
eri._id = "12345678-abcd-abcd-abcd-123456789012"
@@ -1635,4 +1640,30 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
16351640
assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
16361641
assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
16371642
assert "experiment_id:1234567890" in event["tags"]
1643+
assert f"run_id:{DUMMY_EXPERIMENT_FIRST_RUN_ID}" in event["tags"]
1644+
assert "run_iteration:1" in event["tags"]
1645+
assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"]
16381646
assert event["_dd"]["scope"] == "experiments"
1647+
1648+
1649+
def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_record):
1650+
exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator])
1651+
exp._id = "1234567890"
1652+
for i in range(2):
1653+
exp._run_task(1, run=run_info_with_stable_id(i), raise_errors=False)
1654+
assert len(llmobs_events) == i + 1
1655+
event = llmobs_events[i]
1656+
assert event["name"] == "dummy_task"
1657+
for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"):
1658+
assert event[key] == mock.ANY
1659+
assert event["status"] == "ok"
1660+
assert event["meta"]["input"] == '{"prompt": "What is the capital of France?"}'
1661+
assert event["meta"]["output"] == '{"prompt": "What is the capital of France?"}'
1662+
assert event["meta"]["expected_output"] == '{"answer": "Paris"}'
1663+
assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
1664+
assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
1665+
assert "experiment_id:1234567890" in event["tags"]
1666+
assert f"run_id:{DUMMY_EXPERIMENT_FIRST_RUN_ID}" in event["tags"]
1667+
assert f"run_iteration:{i + 1}" in event["tags"]
1668+
assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"]
1669+
assert event["_dd"]["scope"] == "experiments"

0 commit comments

Comments
 (0)