address comments

gary-huang · Yun-Kim · gary-huang · commit c96fbafe4191 · 2025-11-17T12:55:24.000-05:00
Update releasenotes/notes/llmobs-dne-experiments-multi-run-ef099e98a5827e49.yaml

Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>

Update ddtrace/llmobs/_llmobs.py

Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
@@ -403,8 +403,6 @@ def run(self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[i
         self._run_name = experiment_run_name
         run_results = []
         # for backwards compatibility
-        first_run_rows = []
-        first_run_summary_evals = {}
         for run_iteration in range(self._runs):
             run = _ExperimentRunInfo(run_iteration)
             self._tags["run_id"] = str(run._id)
@@ -418,13 +416,11 @@ def run(self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[i
                 self._id, experiment_evals, convert_tags_dict_to_list(self._tags)
             )
             run_results.append(run_result)
-            if run_iteration == 0:
-                first_run_rows = run_result.rows
-                first_run_summary_evals = run_result.summary_evaluations
 
         experiment_result: ExperimentResult = {
-            "summary_evaluations": first_run_summary_evals,
-            "rows": first_run_rows,
+            # for backwards compatibility, the first result fills the old fields of rows and summary evals
+            "summary_evaluations": run_results[0].summary_evaluations if len(run_results) > 0 else {},
+            "rows": run_results[0].rows if len(run_results) > 0 else [],
             "runs": run_results,
         }
         return experiment_result
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
@@ -456,9 +456,7 @@ def _llmobs_tags(span: Span, ml_app: str, session_id: Optional[str] = None) -> L
 
         # set experiment tags on children spans if the tags do not already exist
         experiment_id = span.context.get_baggage_item(EXPERIMENT_ID_KEY)
-        if experiment_id:
-            # the children spans of an experiment span should be tagged by the experiment ID as well
-            if "experiment_id" not in tags:
+        if experiment_id and "experiment_id" not in tags:
                 tags["experiment_id"] = experiment_id
 
         run_id = span.context.get_baggage_item(EXPERIMENT_RUN_ID_KEY)
diff --git a/releasenotes/notes/llmobs-dne-experiments-multi-run-ef099e98a5827e49.yaml b/releasenotes/notes/llmobs-dne-experiments-multi-run-ef099e98a5827e49.yaml
@@ -2,11 +2,10 @@
 features:
   - |
     LLM Observability: Experiments can now be run multiple times by using the optional ``runs`` argument, 
-    to assess the true performance of an experiment in the face of the non determinism of LLMs
+    to assess the true performance of an experiment in the face of the non-determinism of LLMs. Use the new ``runs`` attribute of the ``ExperimentResult`` class to access the results by run iteration.
 deprecations:
   - |
-    LLM Observability: The ``ExperimentResult`` class now has a new ``runs`` attribute to store the results of
-    every experiment run. The ``rows`` and ``summary_evaluations`` attributes will only store the results from the first run
+    LLM Observability: The ``rows`` and ``summary_evaluations`` attributes of the ``ExperimentResult`` class are deprecated and will be removed in the next major release. For multi-run experiments, ``ExperimentResult.rows`` and ``ExperimentResult.summary_evaluations`` will only store the results of the first run iteration. Use the ``ExperimentResult.runs`` attribute instead to access experiment results and summary evaluations.
 fixes:
   - |
-    LLM Observability: experiment children span now have experiment related tags
+    LLM Observability: Non-root experiment spans are now tagged with experiment ID, run ID, and run iteration tags.