diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 95992e59139..43015f65c1e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -155,6 +155,7 @@ ddtrace/contrib/internal/litellm @DataDog/ml-observ ddtrace/contrib/internal/pydantic_ai @DataDog/ml-observability ddtrace/contrib/internal/ray @DataDog/ml-observability ddtrace/contrib/internal/mcp @DataDog/ml-observability +ddtrace/contrib/internal/vllm @DataDog/ml-observability tests/llmobs @DataDog/ml-observability tests/contrib/openai @DataDog/ml-observability tests/contrib/langchain @DataDog/ml-observability @@ -176,6 +177,7 @@ tests/contrib/litellm @DataDog/ml-observ tests/contrib/pydantic_ai @DataDog/ml-observability tests/contrib/ray @DataDog/ml-observability tests/contrib/mcp @DataDog/ml-observability +tests/contrib/vllm @DataDog/ml-observability .gitlab/tests/llmobs.yml @DataDog/ml-observability # MLObs snapshot tests tests/snapshots/tests.contrib.anthropic.* @DataDog/ml-observability @@ -190,6 +192,7 @@ tests/snapshots/tests.contrib.langgraph.* @DataDog/ml-observ tests/snapshots/tests.contrib.crewai.* @DataDog/ml-observability tests/snapshots/tests.contrib.openai_agents.* @DataDog/ml-observability tests/snapshots/tests.contrib.litellm.* @DataDog/ml-observability +tests/snapshots/tests.contrib.vllm.* @DataDog/ml-observability # Remote Config ddtrace/internal/remoteconfig @DataDog/remote-config @DataDog/apm-core-python diff --git a/.gitlab/testrunner.yml b/.gitlab/testrunner.yml index bf90fbd088c..598d187b92e 100644 --- a/.gitlab/testrunner.yml +++ b/.gitlab/testrunner.yml @@ -1,5 +1,6 @@ variables: TESTRUNNER_IMAGE: registry.ddbuild.io/dd-trace-py:v73166438-4077fc2-testrunner-2025.08.08@sha256:456e32d2fdc19569fb34d705d1ccf193c179cc5364d1f93e60f825d189647c3d + TESTRUNNER_GPU_IMAGE: ${TESTRUNNER_IMAGE} .testrunner: image: @@ -18,3 +19,23 @@ variables: artifacts: reports: junit: test-results/junit*.xml + +.testrunner_gpu: + image: + name: ${TESTRUNNER_GPU_IMAGE} + docker: + user: bits + # Use GPU-enabled runners + tags: [ "gpu:a10-amd64" ] + timeout: 40m + before_script: + - ulimit -c unlimited + - git config --global --add safe.directory ${CI_PROJECT_DIR} + - pyenv global 3.12 3.8 3.9 3.10 3.11 3.13 + - export _CI_DD_AGENT_URL=http://${HOST_IP}:8126/ + - export NVIDIA_VISIBLE_DEVICES=all + - export NVIDIA_DRIVER_CAPABILITIES=compute,utility + retry: 2 + artifacts: + reports: + junit: test-results/junit*.xml diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml index 1c8d9365271..7e9988db8bd 100644 --- a/.gitlab/tests.yml +++ b/.gitlab/tests.yml @@ -43,7 +43,6 @@ include: DD_FAST_BUILD = "1" - .test_base_hatch_snapshot: extends: .test_base_hatch services: @@ -54,6 +53,49 @@ include: # agent at that host. Therefore setting this as a variable will cause recursive requests to the testagent - export DD_TRACE_AGENT_URL="http://testagent:9126" +# GPU variants of base templates +.test_base_hatch_gpu: + extends: .testrunner_gpu + stage: hatch + needs: [ prechecks ] + parallel: 2 + variables: + # Request==Limit to achieve Guaranteed QoS; requires Kubernetes executor support + KUBERNETES_MEMORY_REQUEST: "12Gi" + KUBERNETES_MEMORY_LIMIT: "12Gi" + KUBERNETES_CPU_REQUEST: "2" + KUBERNETES_CPU_LIMIT: "2" + before_script: + - !reference [.testrunner_gpu, before_script] + script: + - export PYTEST_ADDOPTS="${PYTEST_ADDOPTS} --ddtrace" + - export _DD_CIVISIBILITY_USE_CI_CONTEXT_PROVIDER=true + - export DD_FAST_BUILD="1" + - | + envs=( $(hatch env show --json | jq -r --arg suite_name "$SUITE_NAME" 'keys[] | select(. 
| contains($suite_name))' | sort | ./.gitlab/ci-split-input.sh) ) + if [[ ${#envs[@]} -eq 0 ]]; then + echo "No hatch envs found for ${SUITE_NAME}" + exit 1 + fi + for env in "${envs[@]}" + do + echo "Running hatch env (GPU): ${env}:test" + hatch run ${env}:test + done + variables: + CMAKE_BUILD_PARALLEL_LEVEL = "12" + CARGO_BUILD_JOBS = "12" + DD_FAST_BUILD = "1" + PIP_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cu124" + +.test_base_hatch_gpu_snapshot: + extends: .test_base_hatch_gpu + services: + - !reference [.services, testagent] + before_script: + - !reference [.test_base_hatch_gpu, before_script] + - export DD_TRACE_AGENT_URL="http://testagent:9126" + # Do not define a `needs:` in order to depend on the whole `precheck` stage .test_base_riot: extends: .testrunner @@ -101,4 +143,51 @@ include: - export DD_TRACE_AGENT_URL="http://testagent:9126" - ln -s "${CI_PROJECT_DIR}" "/home/bits/project" +.test_base_riot_gpu: + extends: .testrunner_gpu + stage: riot + needs: [ build_base_venvs, prechecks ] + parallel: 2 + variables: + KUBERNETES_MEMORY_REQUEST: "12Gi" + KUBERNETES_MEMORY_LIMIT: "12Gi" + KUBERNETES_CPU_REQUEST: "2" + KUBERNETES_CPU_LIMIT: "2" + services: + - !reference [.services, ddagent] + before_script: + - !reference [.testrunner_gpu, before_script] + - unset DD_SERVICE + - unset DD_ENV + - unset DD_TAGS + - unset DD_TRACE_REMOVE_INTEGRATION_SERVICE_NAMES_ENABLED + script: + - | + hashes=( $(.gitlab/scripts/get-riot-hashes.sh "${SUITE_NAME}") ) + if [[ ${#hashes[@]} -eq 0 ]]; then + echo "No riot hashes found for ${SUITE_NAME}" + exit 1 + fi + for hash in "${hashes[@]}" + do + echo "Running riot hash (GPU): ${hash}" + riot list "${hash}" + export _CI_DD_TAGS="test.configuration.riot_hash:${hash}" + ${RIOT_RUN_CMD} "${hash}" -- --ddtrace + done + ./scripts/check-diff ".riot/requirements/" \ + "Changes detected after running riot. Consider deleting changed files, running scripts/compile-and-prune-test-requirements and committing the result." + ./scripts/check-diff "ddtrace/contrib/integration_registry/registry.yaml" \ + "Registry YAML file (ddtrace/contrib/integration_registry/registry.yaml) was modified. Please run: scripts/integration_registry/update_and_format_registry.py and commit the changes." 
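+# Illustrative usage sketch (not part of this change): scripts/gen_gitlab_config.py is
+# expected to append suite jobs below (see "Required jobs will appear here") that extend
+# these GPU templates when the integration is marked with `gpu: true` in the registry,
+# roughly like the following. The job name and SUITE_NAME value here are assumptions
+# shown for illustration only:
+#
+#   vllm:
+#     extends: .test_base_riot_gpu_snapshot
+#     variables:
+#       SUITE_NAME: "vllm"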
+ +.test_base_riot_gpu_snapshot: + extends: .test_base_riot_gpu + services: + - !reference [.test_base_riot_gpu, services] + - !reference [.services, testagent] + before_script: + - !reference [.test_base_riot_gpu, before_script] + - export DD_TRACE_AGENT_URL="http://testagent:9126" + - ln -s "${CI_PROJECT_DIR}" "/home/bits/project" + # Required jobs will appear here diff --git a/.riot/requirements/2043c14.txt b/.riot/requirements/2043c14.txt new file mode 100644 index 00000000000..d72b1a57e39 --- /dev/null +++ b/.riot/requirements/2043c14.txt @@ -0,0 +1,163 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --cert=None --client-cert=None --index-url=None --no-annotate --pip-args=None .riot/requirements/2043c14.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +async-timeout==5.0.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +exceptiongroup==1.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.4.2 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.15.3 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 +setproctitle==1.3.7 +shellingham==1.5.4 +sniffio==1.3.1 +sortedcontainers==2.4.0 
+soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +tomli==2.2.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 +watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==80.9.0 diff --git a/.riot/requirements/460aab7.txt b/.riot/requirements/460aab7.txt new file mode 100644 index 00000000000..f7bc03da935 --- /dev/null +++ b/.riot/requirements/460aab7.txt @@ -0,0 +1,161 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/460aab7.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.5 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.16.2 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 
+setproctitle==1.3.7 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 +watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==79.0.1 diff --git a/.riot/requirements/494e77a.txt b/.riot/requirements/494e77a.txt new file mode 100644 index 00000000000..c288c7061a0 --- /dev/null +++ b/.riot/requirements/494e77a.txt @@ -0,0 +1,160 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --cert=None --client-cert=None --index-url=None --no-annotate --pip-args=None .riot/requirements/494e77a.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.5 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 
+rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.16.2 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 +setproctitle==1.3.7 +shellingham==1.5.4 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 +watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==80.9.0 diff --git a/ddtrace/_monkey.py b/ddtrace/_monkey.py index 68f53f66ece..851feb44293 100644 --- a/ddtrace/_monkey.py +++ b/ddtrace/_monkey.py @@ -113,6 +113,7 @@ "anthropic": True, "crewai": True, "pydantic_ai": True, + "vllm": True, "subprocess": True, "unittest": True, "coverage": False, diff --git a/ddtrace/contrib/integration_registry/registry.yaml b/ddtrace/contrib/integration_registry/registry.yaml index 0608780814e..6420de83e24 100644 --- a/ddtrace/contrib/integration_registry/registry.yaml +++ b/ddtrace/contrib/integration_registry/registry.yaml @@ -955,6 +955,16 @@ integrations: min: 0.6.14 max: 0.7.4 +- integration_name: vllm + is_external_package: true + is_tested: true + dependency_names: + - vllm + tested_versions_by_dependency: + vllm: + min: 0.10.2 + max: 0.10.2 + - integration_name: webbrowser is_external_package: false is_tested: true diff --git a/ddtrace/contrib/internal/vllm/__init__.py b/ddtrace/contrib/internal/vllm/__init__.py new file mode 100644 index 00000000000..7dfaf2d3b53 --- /dev/null +++ b/ddtrace/contrib/internal/vllm/__init__.py @@ -0,0 +1,124 @@ +""" +The vLLM integration traces requests through the vLLM V1 engine. + +**Note**: This integration **only supports vLLM V1** (VLLM_USE_V1=1). V0 engine support has been +removed as V0 is deprecated and will be removed in a future vLLM release. + + +Enabling +~~~~~~~~ + +The vLLM integration is enabled automatically when using +:ref:`ddtrace-run` or :func:`patch_all()`. + +Alternatively, use :func:`patch()` to manually enable the integration:: + + from ddtrace import patch + patch(vllm=True) + + +Global Configuration +~~~~~~~~~~~~~~~~~~~~ + +.. py:data:: ddtrace.config.vllm["service"] + + The service name reported by default for vLLM requests. + + This option can also be set with the ``DD_VLLM_SERVICE`` environment variable. + + Default: ``"vllm"`` + + +Instance Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +To configure particular vLLM instances, use the ``Pin`` API:: + + import vllm + from ddtrace import Pin + + Pin.override(vllm, service="my-vllm-service") + + +Architecture +~~~~~~~~~~~~ + +The integration uses **engine-side tracing** to capture all requests regardless of API entry point: + +1. **Model Name Injection** (``LLMEngine.__init__`` / ``AsyncLLM.__init__``): + - Extracts and stores model name for span tagging + - Forces ``log_stats=True`` to enable latency and token metrics collection + +2. **Context Injection** (``Processor.process_inputs``): + - Injects Datadog trace context into ``trace_headers`` + - Context propagates through the engine automatically + +3. 
**Span Creation** (``OutputProcessor.process_outputs``): + - Creates spans when requests finish + - Extracts data from ``RequestState`` and ``EngineCoreOutput`` + - Decodes prompt from token IDs for chat requests when text is unavailable + - Works for all operations: completion, chat, embedding, cross-encoding + +This design ensures: +- All requests are traced (AsyncLLM, LLM, API server, chat) +- Complete timing and token metrics from engine stats +- Full prompt text capture (including chat conversations) +- Minimal performance overhead + + +Span Tags +~~~~~~~~~ + +All spans are tagged with: + +**Request Information**: +- ``vllm.request.model``: Model name +- ``vllm.request.provider``: ``"vllm"`` + +**Latency Metrics**: +- ``vllm.latency.ttft``: Time to first token (seconds) +- ``vllm.latency.queue``: Queue wait time (seconds) +- ``vllm.latency.prefill``: Prefill phase time (seconds) +- ``vllm.latency.decode``: Decode phase time (seconds) +- ``vllm.latency.inference``: Total inference time (seconds) + +**LLMObs Tags** (when LLMObs is enabled): + +For completion/chat operations: +- ``input_messages``: Prompt text (auto-decoded for chat requests) +- ``output_messages``: Generated text +- ``input_tokens``: Number of input tokens +- ``output_tokens``: Number of generated tokens +- ``temperature``, ``max_tokens``, ``top_p``, ``n``: Sampling parameters +- ``num_cached_tokens``: Number of KV cache hits + +For embedding operations: +- ``input_documents``: Input text or token IDs +- ``output_value``: Embedding shape description +- ``embedding_dim``: Embedding dimension +- ``num_embeddings``: Number of embeddings returned + + +Supported Operations +~~~~~~~~~~~~~~~~~~~~ + +**Async Streaming** (``AsyncLLM``): +- ``generate()``: Text completion +- ``encode()``: Text embedding + +**Offline Batch** (``LLM``): +- ``generate()``: Text completion +- ``chat()``: Multi-turn conversations +- ``encode()``: Text embedding +- ``_cross_encoding_score()``: Cross-encoding scores + +**API Server**: +- All OpenAI-compatible endpoints (automatically traced through engine) + + +Requirements +~~~~~~~~~~~~ + +- vLLM V1 (``VLLM_USE_V1=1``) +- vLLM >= 0.10.2 (for V1 trace header propagation support) +""" diff --git a/ddtrace/contrib/internal/vllm/extractors.py b/ddtrace/contrib/internal/vllm/extractors.py new file mode 100644 index 00000000000..a1b9cbc6e1d --- /dev/null +++ b/ddtrace/contrib/internal/vllm/extractors.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class RequestData: + """Container for vLLM request data extracted from engine outputs.""" + + prompt: Optional[str] = None + input_tokens: int = 0 + output_tokens: int = 0 + output_text: str = "" + finish_reason: Optional[str] = None + embedding_dim: Optional[int] = None + num_embeddings: int = 1 + lora_name: Optional[str] = None + num_cached_tokens: int = 0 + temperature: Optional[float] = None + top_p: Optional[float] = None + n: Optional[int] = None + max_tokens: Optional[int] = None + input_: Optional[list[int]] = None + + +def get_embedding_shape(tensor) -> tuple[int, Optional[int]]: + """Extract (num_embeddings, embedding_dim) from torch.Tensor.""" + if tensor is None or len(tensor.shape) == 0: + return 1, None + + if len(tensor.shape) == 1: + return 1, int(tensor.shape[0]) + + first, last = int(tensor.shape[0]), int(tensor.shape[-1]) + if last == 1: + return 1, first + return first, last + + +def extract_request_data(req_state, engine_core_output) -> RequestData: + 
"""Extract request data from engine-side structures. + + Args: + req_state: RequestState from OutputProcessor.request_states + engine_core_output: EngineCoreOutput from engine_core + + Returns: + RequestData for LLMObs tagging + """ + is_embedding = engine_core_output.pooling_output is not None + + # Get prompt text - if not available, decode from token IDs (but not for embeddings) + prompt_text = req_state.prompt + if not is_embedding and prompt_text is None and req_state.prompt_token_ids and req_state.detokenizer: + tokenizer = getattr(req_state.detokenizer, "tokenizer", None) + if tokenizer: + prompt_text = tokenizer.decode(req_state.prompt_token_ids) + + data = RequestData( + prompt=prompt_text, + input_tokens=req_state.prompt_len or 0, + lora_name=req_state.lora_name, + num_cached_tokens=engine_core_output.num_cached_tokens, + temperature=req_state.temperature, + top_p=req_state.top_p, + n=req_state.n, + max_tokens=req_state.max_tokens_param, + ) + + if engine_core_output.finish_reason: + data.finish_reason = str(engine_core_output.finish_reason) + + if is_embedding: + num_emb, emb_dim = get_embedding_shape(engine_core_output.pooling_output) + data.num_embeddings = num_emb + data.embedding_dim = emb_dim + data.input_ = req_state.prompt_token_ids + else: + # Don't extract output_tokens here - stats not updated yet + # Will be extracted later from captured stats reference + + if req_state.detokenizer: + data.output_text = req_state.detokenizer.output_text + + return data + + +def get_model_name(instance) -> Optional[str]: + """Extract injected model name (set by traced_engine_init)""" + return getattr(instance, "_dd_model_name", None) diff --git a/ddtrace/contrib/internal/vllm/patch.py b/ddtrace/contrib/internal/vllm/patch.py new file mode 100644 index 00000000000..335d0f4a85d --- /dev/null +++ b/ddtrace/contrib/internal/vllm/patch.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +import vllm + +from ddtrace import config +from ddtrace._trace.pin import Pin +from ddtrace.contrib.trace_utils import unwrap +from ddtrace.contrib.trace_utils import with_traced_module +from ddtrace.contrib.trace_utils import wrap +from ddtrace.internal.logger import get_logger +from ddtrace.llmobs._integrations.vllm import VLLMIntegration + +from .extractors import extract_request_data +from .extractors import get_model_name +from .utils import create_span +from .utils import inject_trace_context + + +logger = get_logger(__name__) + +config._add("vllm", {}) + + +@with_traced_module +def traced_engine_init(vllm, pin, func, instance, args, kwargs): + """Inject model name into OutputProcessor and force-enable stats for tracing.""" + # ALWAYS enable stats for tracing - we need req_state.stats.num_generation_tokens + # log_stats is the 3rd positional arg (after vllm_config, executor_class) + if len(args) > 2: + args = args[:2] + (True,) + args[3:] + else: + kwargs["log_stats"] = True + + result = func(*args, **kwargs) + + if hasattr(instance, "model_config") and hasattr(instance, "output_processor"): + model_name = getattr(instance.model_config, "model", None) + if model_name: + instance.output_processor._dd_model_name = model_name + + return result + + +@with_traced_module +def traced_processor_process_inputs(vllm, pin, func, instance, args, kwargs): + """Inject Datadog trace context into trace_headers for propagation.""" + tracer = pin.tracer + + if len(args) > 6: + trace_headers = args[6] + injected_headers = inject_trace_context(tracer, trace_headers) + args = args[:6] + (injected_headers,) + 
args[7:] + else: + trace_headers = kwargs.get("trace_headers") + kwargs["trace_headers"] = inject_trace_context(tracer, trace_headers) + + return func(*args, **kwargs) + + +@with_traced_module +def traced_output_processor_process_outputs(vllm, pin, func, instance, args, kwargs): + """Create Datadog spans for finished requests.""" + integration = vllm._datadog_integration + + engine_core_outputs = args[0] if args else kwargs.get("engine_core_outputs") + + if not engine_core_outputs: + return func(*args, **kwargs) + + model_name = get_model_name(instance) + + # Capture req_states BEFORE calling func, as func will remove them + spans_data = [] + for engine_core_output in engine_core_outputs: + req_id = engine_core_output.request_id + + if not engine_core_output.finished: + continue + + req_state = instance.request_states.get(req_id) + if not req_state: + continue + + # Extract all data we need before func() removes req_state + arrival_time = req_state.stats.arrival_time if req_state.stats else None + stats = req_state.stats + data = extract_request_data(req_state, engine_core_output) + + spans_data.append( + { + "req_id": req_id, + "trace_headers": engine_core_output.trace_headers, + "arrival_time": arrival_time, + "data": data, + "stats": stats, + } + ) + + # Now call the original function + result = func(*args, **kwargs) + + # Create spans after original function completes + for span_info in spans_data: + span = create_span( + pin=pin, + integration=integration, + model_name=model_name, + trace_headers=span_info["trace_headers"], + arrival_time=span_info["arrival_time"], + ) + + data = span_info["data"] + operation = "embedding" if data.embedding_dim is not None else "completion" + + # Extract output_tokens from stats NOW (after original function updated it) + if operation == "completion" and span_info["stats"]: + data.output_tokens = span_info["stats"].num_generation_tokens + + integration.llmobs_set_tags( + span, + args=[], + kwargs={"request_data": data, "stats": span_info["stats"]}, + response=None, + operation=operation, + ) + + span.finish() + + return result + + +def patch(): + """Patch vLLM V1 library for Datadog tracing.""" + if getattr(vllm, "_datadog_patch", False): + return + + # Check vLLM version - require >= 0.10.2 for V1 trace header propagation + try: + from packaging.version import parse as parse_version + + version_str = getattr(vllm, "__version__", "0.0.0") + base_version = parse_version(version_str).base_version + if parse_version(base_version) < parse_version("0.10.2"): + logger.warning( + "vLLM integration requires vLLM >= 0.10.2 for V1 engine support. " + "Found version %s. 
Skipping instrumentation.", + version_str, + ) + return + except Exception: + logger.debug("Could not verify vLLM version, proceeding with instrumentation") + + vllm._datadog_patch = True + + Pin().onto(vllm) + integration = VLLMIntegration(integration_config=config.vllm) + vllm._datadog_integration = integration + + wrap("vllm.v1.engine.llm_engine", "LLMEngine.__init__", traced_engine_init(vllm)) + wrap("vllm.v1.engine.async_llm", "AsyncLLM.__init__", traced_engine_init(vllm)) + wrap("vllm.v1.engine.processor", "Processor.process_inputs", traced_processor_process_inputs(vllm)) + wrap( + "vllm.v1.engine.output_processor", + "OutputProcessor.process_outputs", + traced_output_processor_process_outputs(vllm), + ) + + +def unpatch(): + if not getattr(vllm, "_datadog_patch", False): + return + + vllm._datadog_patch = False + + unwrap(vllm.v1.engine.llm_engine.LLMEngine, "__init__") + unwrap(vllm.v1.engine.async_llm.AsyncLLM, "__init__") + unwrap(vllm.v1.engine.processor.Processor, "process_inputs") + unwrap(vllm.v1.engine.output_processor.OutputProcessor, "process_outputs") + + delattr(vllm, "_datadog_integration") diff --git a/ddtrace/contrib/internal/vllm/utils.py b/ddtrace/contrib/internal/vllm/utils.py new file mode 100644 index 00000000000..3e5abba7d04 --- /dev/null +++ b/ddtrace/contrib/internal/vllm/utils.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import Optional + +from ddtrace._trace.pin import Pin +from ddtrace.propagation.http import HTTPPropagator +from ddtrace.trace import Context +from ddtrace.trace import Span + + +def create_span( + pin: Pin, + integration, + model_name: Optional[str], + trace_headers: Optional[dict[str, str]], + arrival_time: Optional[float] = None, +): + """Create a vLLM span with parent context from trace headers.""" + parent_ctx = None + if trace_headers: + parent_ctx = HTTPPropagator.extract(trace_headers) + + span = integration.trace( + pin=pin, + operation_id="vllm.request", + submit_to_llmobs=True, + parent_context=parent_ctx, + model_name=model_name, + ) + + if arrival_time: + span.start_ns = int(arrival_time * 1e9) + + return span + + +def set_latency_metrics(span, stats): + """Set latency metrics from RequestStateStats.""" + if not stats: + return + + if stats.first_token_latency: + span.set_metric("vllm.latency.ttft", float(stats.first_token_latency)) + + queued = stats.queued_ts + scheduled = stats.scheduled_ts + first_token = stats.first_token_ts + last_token = stats.last_token_ts + + if queued and scheduled: + span.set_metric("vllm.latency.queue", float(scheduled - queued)) + + if scheduled and first_token: + span.set_metric("vllm.latency.prefill", float(first_token - scheduled)) + + if first_token and last_token and last_token > first_token: + span.set_metric("vllm.latency.decode", float(last_token - first_token)) + + if scheduled and last_token: + span.set_metric("vllm.latency.inference", float(last_token - scheduled)) + + +def inject_trace_context(tracer, trace_headers: Optional[dict[str, str]]) -> dict[str, str]: + """Inject current trace context into headers for propagation.""" + headers = dict(trace_headers) if trace_headers else {} + + active = tracer.context_provider.active() + if active: + if isinstance(active, Span): + HTTPPropagator.inject(active.context, headers) + elif isinstance(active, Context): + HTTPPropagator.inject(active, headers) + + return headers diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 54ec9b7e263..d5ea0a0bd23 100644 --- a/ddtrace/llmobs/_constants.py +++ 
b/ddtrace/llmobs/_constants.py @@ -44,6 +44,12 @@ CACHE_READ_INPUT_TOKENS_METRIC_KEY = "cache_read_input_tokens" BILLABLE_CHARACTER_COUNT_METRIC_KEY = "billable_character_count" +TIME_TO_FIRST_TOKEN_METRIC_KEY = "time_to_first_token" # nosec B105 +TIME_IN_QUEUE_METRIC_KEY = "time_in_queue" +TIME_IN_MODEL_PREFILL_METRIC_KEY = "time_in_model_prefill" +TIME_IN_MODEL_DECODE_METRIC_KEY = "time_in_model_decode" +TIME_IN_MODEL_INFERENCE_METRIC_KEY = "time_in_model_inference" + EVP_PROXY_AGENT_BASE_PATH = "/evp_proxy/v2" EVAL_ENDPOINT = "/api/intake/llm-obs/v2/eval-metric" SPAN_ENDPOINT = "/api/v2/llmobs" diff --git a/ddtrace/llmobs/_integrations/base.py b/ddtrace/llmobs/_integrations/base.py index 0b0fc312afc..4a3336383a2 100644 --- a/ddtrace/llmobs/_integrations/base.py +++ b/ddtrace/llmobs/_integrations/base.py @@ -59,12 +59,18 @@ def trace(self, pin: Pin, operation_id: str, submit_to_llmobs: bool = False, **k Eventually those should also be internal service spans once peer.service is implemented. """ span_name = kwargs.get("span_name", None) or "{}.request".format(self._integration_name) - span = pin.tracer.trace( + span_type = SpanTypes.LLM if (submit_to_llmobs and self.llmobs_enabled) else None + parent_context = kwargs.get("parent_context") or pin.tracer.context_provider.active() + + span = pin.tracer.start_span( span_name, - resource=operation_id, + child_of=parent_context, service=int_service(pin, self.integration_config), - span_type=SpanTypes.LLM if (submit_to_llmobs and self.llmobs_enabled) else None, + resource=operation_id, + span_type=span_type, + activate=True, ) + log.debug("Creating LLM span with type %s", span.span_type) # determine if the span represents a proxy request base_url = self._get_base_url(**kwargs) diff --git a/ddtrace/llmobs/_integrations/vllm.py b/ddtrace/llmobs/_integrations/vllm.py new file mode 100644 index 00000000000..4d642f7c1cd --- /dev/null +++ b/ddtrace/llmobs/_integrations/vllm.py @@ -0,0 +1,163 @@ +"""LLMObs integration for vLLM V1 library.""" + +from __future__ import annotations + +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +from ddtrace.contrib.internal.vllm.extractors import RequestData +from ddtrace.llmobs._constants import INPUT_DOCUMENTS +from ddtrace.llmobs._constants import INPUT_MESSAGES +from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY +from ddtrace.llmobs._constants import METADATA +from ddtrace.llmobs._constants import METRICS +from ddtrace.llmobs._constants import MODEL_NAME +from ddtrace.llmobs._constants import MODEL_PROVIDER +from ddtrace.llmobs._constants import OUTPUT_MESSAGES +from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY +from ddtrace.llmobs._constants import OUTPUT_VALUE +from ddtrace.llmobs._constants import SPAN_KIND +from ddtrace.llmobs._constants import TIME_IN_MODEL_DECODE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_MODEL_INFERENCE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_MODEL_PREFILL_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_QUEUE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_TO_FIRST_TOKEN_METRIC_KEY +from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY +from ddtrace.llmobs._integrations.base import BaseLLMIntegration +from ddtrace.llmobs.utils import Document +from ddtrace.trace import Span + + +class VLLMIntegration(BaseLLMIntegration): + """LLMObs integration for vLLM V1 library.""" + + _integration_name = "vllm" + + _METADATA_FIELDS = { + "temperature", + 
"max_tokens", + "top_p", + "n", + "num_cached_tokens", + "embedding_dim", + "finish_reason", + "lora_name", + } + + def _set_base_span_tags(self, span: Span, **kwargs: Any) -> None: + """Set base tags on vLLM spans.""" + model_name = kwargs.get("model_name") + if model_name: + span.set_tag_str("vllm.request.model", model_name) + span.set_tag_str("vllm.request.provider", "vllm") + + def _build_metadata(self, data: RequestData) -> Dict[str, Any]: + """Extract metadata from request data.""" + md: Dict[str, Any] = {} + + for key in self._METADATA_FIELDS: + val = getattr(data, key, None) + if val is not None: + md[key] = val + + return md + + def _build_metrics(self, data: RequestData, stats=None) -> Dict[str, Any]: + """Build token and latency metrics from request data.""" + it = int(data.input_tokens or 0) + ot = int(data.output_tokens or 0) + metrics: Dict[str, Any] = { + INPUT_TOKENS_METRIC_KEY: it, + OUTPUT_TOKENS_METRIC_KEY: ot, + TOTAL_TOKENS_METRIC_KEY: it + ot, + } + + # Add latency metrics if stats are available + if stats: + if stats.first_token_latency: + metrics[TIME_TO_FIRST_TOKEN_METRIC_KEY] = float(stats.first_token_latency) + + queued = stats.queued_ts + scheduled = stats.scheduled_ts + first_token = stats.first_token_ts + last_token = stats.last_token_ts + + if queued and scheduled: + metrics[TIME_IN_QUEUE_METRIC_KEY] = float(scheduled - queued) + + if scheduled and first_token: + metrics[TIME_IN_MODEL_PREFILL_METRIC_KEY] = float(first_token - scheduled) + + if first_token and last_token and last_token > first_token: + metrics[TIME_IN_MODEL_DECODE_METRIC_KEY] = float(last_token - first_token) + + if scheduled and last_token: + metrics[TIME_IN_MODEL_INFERENCE_METRIC_KEY] = float(last_token - scheduled) + + return metrics + + def _build_embedding_context(self, data: RequestData, stats=None) -> Dict[str, Any]: + """Build LLMObs context for embedding operations.""" + ctx: Dict[str, Any] = { + SPAN_KIND: "embedding", + METADATA: self._build_metadata(data), + METRICS: self._build_metrics(data, stats), + } + + docs: List[Document] = [] + if data.prompt: + docs = [Document(text=data.prompt)] + elif data.input_: + docs = [Document(text=str(data.input_))] + + if docs: + ctx[INPUT_DOCUMENTS] = docs + + num_emb = data.num_embeddings + dim = data.embedding_dim + ctx[OUTPUT_VALUE] = ( + f"[{num_emb} embedding(s) returned with size {dim}]" if dim else f"[{num_emb} embedding(s) returned]" + ) + + return ctx + + def _build_completion_context(self, data: RequestData, stats=None) -> Dict[str, Any]: + """Build LLMObs context for completion operations.""" + ctx: Dict[str, Any] = { + SPAN_KIND: "llm", + METADATA: self._build_metadata(data), + METRICS: self._build_metrics(data, stats), + } + + if data.prompt: + ctx[INPUT_MESSAGES] = [{"content": data.prompt}] + + if data.output_text: + ctx[OUTPUT_MESSAGES] = [{"content": data.output_text}] + + return ctx + + def _llmobs_set_tags( + self, + span: Span, + args: List[Any], + kwargs: Dict[str, Any], + response: Optional[Any] = None, + operation: str = "", + ) -> None: + """Set LLMObs tags on span.""" + data: Optional[RequestData] = kwargs.get("request_data") + if data is None: + return + + stats = kwargs.get("stats") + ctx = ( + self._build_embedding_context(data, stats) + if operation == "embedding" + else self._build_completion_context(data, stats) + ) + ctx[MODEL_NAME] = span.get_tag("vllm.request.model") or "" + ctx[MODEL_PROVIDER] = span.get_tag("vllm.request.provider") or "" + span._set_ctx_items(ctx) diff --git a/ddtrace/settings/_config.py 
b/ddtrace/settings/_config.py index 7becb1dc284..94b0b7d3722 100644 --- a/ddtrace/settings/_config.py +++ b/ddtrace/settings/_config.py @@ -176,6 +176,7 @@ "openai", "crewai", "pydantic_ai", + "vllm", "logging", "cassandra", "boto", diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 00000000000..a423ce83a5f --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,9 @@ +services: + testrunner: + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] diff --git a/docs/integrations.rst b/docs/integrations.rst index c33138a508e..f865e79a606 100644 --- a/docs/integrations.rst +++ b/docs/integrations.rst @@ -633,6 +633,13 @@ Vertica .. _webbrowser: +vLLM +^^^^ +.. automodule:: ddtrace.contrib.internal.vllm + + +.. _vllm: + Webbrowser ^^^^^^^^^^ .. automodule:: ddtrace.contrib.internal.webbrowser diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 19d3e99b2e8..264f3ba52b3 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -225,6 +225,7 @@ posix postgres pre preconfigured +Prefill preload prepend prepended @@ -332,6 +333,7 @@ vendored versioned vertexai vertica +vLLM w3c Webbrowser websocket diff --git a/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml b/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml new file mode 100644 index 00000000000..64b362db1c9 --- /dev/null +++ b/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + vllm: Introduces tracing and LLM Observability support for vLLM V1 engine. + Requires vLLM >= 0.10.2. See `the docs `_ + for more information. diff --git a/riotfile.py b/riotfile.py index 534cb666224..9aff6cb6f69 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3366,6 +3366,17 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT }, pys=select_pys(min_version="3.8", max_version="3.13"), ), + Venv( + name="vllm", + command="pytest {cmdargs} tests/contrib/vllm", + pkgs={ + "pytest-asyncio": "==0.21.1", + "pytest-randomly": latest, + "torch": latest, + "vllm": latest, + }, + pys=select_pys(min_version="3.10", max_version="3.12"), + ), Venv( name="valkey", command="pytest {cmdargs} tests/contrib/valkey", diff --git a/scripts/ddtest b/scripts/ddtest index 9bbd5148fe0..50a7f146133 100755 --- a/scripts/ddtest +++ b/scripts/ddtest @@ -9,7 +9,15 @@ then CMD=bash fi -docker compose run \ +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +COMPOSE_FILES=("-f" "$REPO_ROOT/docker-compose.yml") +if command -v nvidia-smi >/dev/null 2>&1; then + COMPOSE_FILES+=("-f" "$REPO_ROOT/docker-compose.gpu.yml") +fi + +docker compose ${COMPOSE_FILES[@]} run \ -e DD_TRACE_AGENT_URL \ --rm \ -i \ diff --git a/scripts/gen_gitlab_config.py b/scripts/gen_gitlab_config.py index 53335c20a79..1436e1a5fc5 100644 --- a/scripts/gen_gitlab_config.py +++ b/scripts/gen_gitlab_config.py @@ -29,10 +29,13 @@ class JobSpec: allow_failure: bool = False paths: t.Optional[t.Set[str]] = None # ignored only: t.Optional[t.Set[str]] = None # ignored + gpu: bool = False def __str__(self) -> str: lines = [] base = f".test_base_{self.runner}" + if self.gpu: + base += "_gpu" if self.snapshot: base += "_snapshot" @@ -121,10 +124,22 @@ def gen_required_suites() -> None: """Generate the list of test suites that need to be run.""" from needs_testrun import extract_git_commit_selections from needs_testrun import for_each_testrun_needed + from ruamel.yaml import YAML import suitespec suites = suitespec.get_suites() + # Load GPU-enabled integrations from registry + registry_path = ROOT / "ddtrace" / "contrib" / "integration_registry" / "registry.yaml" + with YAML() as yaml: + reg = yaml.load(registry_path) or {} + gpu_integrations: t.Set[str] = set() + for entry in reg.get("integrations", []) or []: + if entry.get("gpu") is True: + name = str(entry.get("integration_name", "")).strip() + if name: + gpu_integrations.add(name) + required_suites: t.List[str] = [] for_each_testrun_needed( @@ -156,6 +171,9 @@ def gen_required_suites() -> None: # Store the stage in the suite config for later use suite_config["_stage"] = stage suite_config["_clean_name"] = clean_name + # Mark GPU requirement if the clean suite name matches a GPU integration + if clean_name in gpu_integrations: + suite_config["gpu"] = True # Sort stages: setup first, then alphabetically sorted_stages = ["setup"] + sorted(stages - {"setup"}) diff --git a/supported_versions_output.json b/supported_versions_output.json index 8c73df9b3df..5e34ac2b9f6 100644 --- a/supported_versions_output.json +++ b/supported_versions_output.json @@ -707,6 +707,13 @@ "pinned": "true", "auto-instrumented": true }, + { + "dependency": "vllm", + "integration": "vllm", + "minimum_tracer_supported": "0.10.2", + "max_tracer_supported": "0.10.2", + "auto-instrumented": true + }, { "dependency": "yaaredis", "integration": "yaaredis", diff --git a/supported_versions_table.csv b/supported_versions_table.csv index 7a97f4b71df..6281a37354a 100644 --- a/supported_versions_table.csv +++ b/supported_versions_table.csv @@ -98,4 +98,5 @@ valkey,valkey,6.0.2,6.1.1,True google-cloud-aiplatform,vertexai,1.71.1,1.71.1,True vertexai,vertexai,1.71.1,1.71.1,True vertica-python,vertica *,0.6.14,0.7.4,True +vllm,vllm,0.10.2,0.10.2,True yaaredis,yaaredis,2.0.4,3.0.0,True diff --git a/tests/contrib/vllm/__init__.py b/tests/contrib/vllm/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/contrib/vllm/_utils.py b/tests/contrib/vllm/_utils.py new file mode 100644 index 00000000000..a565d4ecf47 --- /dev/null +++ b/tests/contrib/vllm/_utils.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import gc + +import torch +from vllm.engine.arg_utils import AsyncEngineArgs + + +def create_async_engine(model: str, *, engine_mode: str = "0", **kwargs): + """Create an async engine (V0 or V1) with auto-tuned GPU memory utilization.""" + gpu_util = kwargs.pop("gpu_memory_utilization", None) + gpu_util_candidates = [gpu_util] if 
gpu_util else [0.1, 0.2, 0.3, 0.5] + + for util in gpu_util_candidates: + try: + args = AsyncEngineArgs(model=model, gpu_memory_utilization=util, **kwargs) + if engine_mode == "1": + from vllm.v1.engine.async_llm import AsyncLLM + + return AsyncLLM.from_engine_args(args) + else: + from vllm.engine.async_llm_engine import AsyncLLMEngine + + return AsyncLLMEngine.from_engine_args(args) + except Exception as exc: + last_error = exc + continue + raise last_error # type: ignore[possibly-unbound] + + +def get_simple_chat_template() -> str: + """Return a simple chat template for testing.""" + return ( + "{% for message in messages %}" + "{% if message['role'] == 'system' %}{{ message['content'] }}\n" + "{% elif message['role'] == 'user' %}User: {{ message['content'] }}\n" + "{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n" + "{% endif %}" + "{% endfor %}" + "Assistant:" + ) + + +def shutdown_cached_llms() -> None: + """Free GPU memory after tests.""" + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/tests/contrib/vllm/api_app.py b/tests/contrib/vllm/api_app.py new file mode 100644 index 00000000000..cada0042442 --- /dev/null +++ b/tests/contrib/vllm/api_app.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from typing import List + +from fastapi import FastAPI +from fastapi import Request +from pydantic import BaseModel +import torch +import torch.nn.functional as F +import vllm + +from ddtrace import tracer as ddtracer +from ddtrace.propagation.http import HTTPPropagator + +from ._utils import create_async_engine + + +class RagRequest(BaseModel): + query: str + documents: List[str] + + +app = FastAPI() + +# Common engine parameters (V1 only) +EMBED_PARAMS = { + "model": "intfloat/e5-small-v2", + "enforce_eager": True, + "max_model_len": 256, + "compilation_config": {"use_inductor": False}, + "trust_remote_code": True, + "gpu_memory_utilization": 0.1, + "runner": "pooling", +} + +GEN_PARAMS = { + "model": "facebook/opt-125m", + "enforce_eager": True, + "max_model_len": 256, + "compilation_config": {"use_inductor": False}, + "trust_remote_code": True, + "gpu_memory_utilization": 0.1, +} + + +async def embed_texts(engine, texts: List[str], base_request_id: str) -> List[torch.Tensor]: + """Embed a list of texts and return their vector representations.""" + pooling_params = vllm.PoolingParams(task="encode") + vectors: List[torch.Tensor] = [] + + for i, text in enumerate(texts): + request_id = f"{base_request_id}-{i}" if len(texts) > 1 else base_request_id + last = None + async for out in engine.encode( + prompt=text, + pooling_params=pooling_params, + request_id=request_id, + ): + last = out + if out.finished: + break + + if last and last.outputs is not None and hasattr(last.outputs, "data"): + emb = last.outputs.data + if emb.dim() > 1: + emb = emb.mean(dim=0) + vectors.append(emb.detach().to("cpu", copy=True).float()) + + return vectors + + +async def generate_text(engine, prompt: str, sampling_params: vllm.SamplingParams, request_id: str) -> str: + """Generate text using the given prompt and sampling parameters.""" + last = None + async for out in engine.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id=request_id, + ): + last = out + if out.finished: + break + + if last and last.outputs: + sample = last.outputs[0] if isinstance(last.outputs, list) and last.outputs else None + if sample and hasattr(sample, "text") and sample.text: + return sample.text + return "" + + +@app.post("/rag") +async def 
rag(req: RagRequest, request: Request): + """RAG endpoint using vLLM V1 for embedding and text generation.""" + # Activate trace context from client headers if provided + headers = dict(request.headers) + ctx = HTTPPropagator.extract(headers) + if ctx: + ddtracer.context_provider.activate(ctx) + + # Create V1 embedding engine + embed_engine = create_async_engine(**EMBED_PARAMS) + doc_vecs = await embed_texts(embed_engine, req.documents, "embed") + query_vecs = await embed_texts(embed_engine, [req.query], "embed-query") + query_vec = query_vecs[0] if query_vecs else None + + # Find most similar document + top_doc = req.documents[0] + if query_vec is not None and doc_vecs: + sims = [F.cosine_similarity(query_vec.unsqueeze(0), d.unsqueeze(0)).item() for d in doc_vecs] + top_idx = int(max(range(len(sims)), key=lambda i: sims[i])) + top_doc = req.documents[top_idx] + + torch.cuda.empty_cache() + + # Create V1 generation engine + gen_engine = create_async_engine(**GEN_PARAMS) + sampling = vllm.SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64, seed=42) + prompt = f"Context: {top_doc}\nQuestion: {req.query}\nAnswer:" + generated_text = await generate_text(gen_engine, prompt, sampling, "gen-0") + + return {"generated_text": generated_text, "retrieved_document": top_doc} diff --git a/tests/contrib/vllm/conftest.py b/tests/contrib/vllm/conftest.py new file mode 100644 index 00000000000..9f1f5bf69e2 --- /dev/null +++ b/tests/contrib/vllm/conftest.py @@ -0,0 +1,148 @@ +import gc +import weakref + +import pytest +import torch + +from ddtrace._trace.pin import Pin +from ddtrace.contrib.internal.vllm.patch import patch +from ddtrace.contrib.internal.vllm.patch import unpatch +from ddtrace.llmobs import LLMObs as llmobs_service +from tests.llmobs._utils import TestLLMObsSpanWriter +from tests.utils import DummyTracer +from tests.utils import DummyWriter +from tests.utils import override_global_config + +from ._utils import shutdown_cached_llms + + +@pytest.fixture(scope="session", autouse=True) +def _shutdown_cached_llms_session(): + yield + shutdown_cached_llms() + + +@pytest.fixture(autouse=True) +def _per_test_llm_cleanup(): + """Free CUDA memory after each test.""" + yield + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +@pytest.fixture(autouse=True, scope="session") +def require_gpu(): + """Skip vLLM tests if GPU is not available.""" + if not (hasattr(torch, "cuda") and torch.cuda.is_available()): + pytest.skip("Skipping vLLM tests: GPU not available") + + +@pytest.fixture() +def vllm(): + patch() + import vllm + + yield vllm + unpatch() + + +@pytest.fixture +def mock_tracer(vllm): + pin = Pin.get_from(vllm) + mock_tracer = DummyTracer(writer=DummyWriter(trace_flush_enabled=False)) + pin._override(vllm, tracer=mock_tracer) + yield mock_tracer + + +@pytest.fixture +def llmobs_span_writer(): + yield TestLLMObsSpanWriter(1.0, 5.0, is_agentless=True, _site="datad0g.com") + + +@pytest.fixture +def vllm_llmobs(mock_tracer, llmobs_span_writer): + llmobs_service.disable() + with override_global_config({"_llmobs_ml_app": "", "service": "tests.contrib.vllm"}): + llmobs_service.enable(_tracer=mock_tracer, integrations_enabled=False) + llmobs_service._instance._llmobs_span_writer = llmobs_span_writer + yield llmobs_service + llmobs_service.disable() + + +@pytest.fixture +def llmobs_events(vllm_llmobs, llmobs_span_writer): + return llmobs_span_writer.events + + +@pytest.fixture(scope="module") +def opt_125m_llm(): + """Cached facebook/opt-125m LLM for text generation 
tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="facebook/opt-125m", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def e5_small_llm(): + """Cached intfloat/e5-small LLM for embedding tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="intfloat/e5-small", + runner="pooling", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + trust_remote_code=True, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def bge_reranker_llm(): + """Cached BAAI/bge-reranker-v2-m3 LLM for classification/ranking tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="BAAI/bge-reranker-v2-m3", + runner="pooling", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + trust_remote_code=True, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() diff --git a/tests/contrib/vllm/test_api_app.py b/tests/contrib/vllm/test_api_app.py new file mode 100644 index 00000000000..a4176139443 --- /dev/null +++ b/tests/contrib/vllm/test_api_app.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from fastapi.testclient import TestClient +import pytest + +from ddtrace import tracer as ddtracer +from ddtrace._trace.pin import Pin +from ddtrace.llmobs import LLMObs as llmobs_service +from ddtrace.propagation.http import HTTPPropagator +from tests.utils import override_global_config + +from .api_app import app + + +IGNORE_FIELDS = [ + "meta._dd.p.llmobs_trace_id", +] + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_rag_parent_child(vllm, llmobs_span_writer): + """Test RAG endpoint with parent-child span relationships and LLMObs event capture.""" + # Ensure snapshot writer receives traces: use global tracer for vLLM Pin + pin = Pin.get_from(vllm) + if pin is not None: + pin._override(vllm, tracer=ddtracer) + + # Enable LLMObs on ddtracer with integrations enabled and use test writer + llmobs_service.disable() + with override_global_config({"_llmobs_ml_app": "", "service": "tests.contrib.vllm"}): + llmobs_service.enable(_tracer=ddtracer, integrations_enabled=False) + llmobs_service._instance._llmobs_span_writer = llmobs_span_writer + + # Create a parent span and inject context into headers + with ddtracer.trace("api.rag") as parent_span: + headers = {} + HTTPPropagator.inject(parent_span.context, headers) + + client = TestClient(app) + payload = { + "query": "What is the capital of France?", + "documents": [ + "Paris is the capital and most populous city of France.", + "Berlin is Germany's capital.", + ], + } + + res = client.post("/rag", json=payload, headers=headers) + assert res.status_code == 200 + + llmobs_service.disable() + + # Verify LLMObs events were captured + # Should have events for: embed 
doc1, embed doc2, embed query, generate text + llmobs_events = llmobs_span_writer.events + assert len(llmobs_events) == 4 + + # Verify we have both embedding and completion operations + span_kinds = [event["meta"]["span"]["kind"] for event in llmobs_events] + assert span_kinds.count("embedding") == 3 # 2 docs + 1 query + assert span_kinds.count("llm") == 1 # 1 generation + + # Check embedding events (order may vary) + embedding_docs = { + "Paris is the capital and most populous city of France.", + "Berlin is Germany's capital.", + "What is the capital of France?", + } + + embedding_events = [e for e in llmobs_events if e["meta"]["span"]["kind"] == "embedding"] + generation_events = [e for e in llmobs_events if e["meta"]["span"]["kind"] == "llm"] + + captured_docs = {e["meta"]["input"]["documents"][0]["text"] for e in embedding_events} + assert captured_docs == embedding_docs + + # Verify all embedding events have correct structure + for event in embedding_events: + assert event["meta"]["model_name"] == "intfloat/e5-small-v2" + assert event["meta"]["model_provider"] == "vllm" + assert event["meta"]["metadata"]["embedding_dim"] == 384 + assert event["meta"]["metadata"]["num_cached_tokens"] == 0 + assert event["metrics"]["input_tokens"] > 0 + assert event["metrics"]["output_tokens"] == 0 + assert "time_to_first_token" in event["metrics"] + assert "time_in_queue" in event["metrics"] + assert "time_in_model_prefill" in event["metrics"] + assert "time_in_model_inference" in event["metrics"] + assert "ml_app:" in event["tags"] + assert "service:tests.contrib.vllm" in event["tags"] + + # Verify generation event has correct structure + assert len(generation_events) == 1 + gen_event = generation_events[0] + assert gen_event["meta"]["model_name"] == "facebook/opt-125m" + assert gen_event["meta"]["model_provider"] == "vllm" + assert gen_event["meta"]["metadata"]["temperature"] == 0.8 + assert gen_event["meta"]["metadata"]["top_p"] == 0.95 + assert gen_event["meta"]["metadata"]["max_tokens"] == 64 + assert gen_event["meta"]["metadata"]["n"] == 1 + assert gen_event["meta"]["metadata"]["num_cached_tokens"] == 0 + assert gen_event["metrics"]["input_tokens"] == 27 + assert gen_event["metrics"]["output_tokens"] > 0 + assert "time_to_first_token" in gen_event["metrics"] + assert "time_in_queue" in gen_event["metrics"] + assert "time_in_model_prefill" in gen_event["metrics"] + assert "time_in_model_decode" in gen_event["metrics"] + assert "time_in_model_inference" in gen_event["metrics"] + assert "ml_app:" in gen_event["tags"] + assert "service:tests.contrib.vllm" in gen_event["tags"] diff --git a/tests/contrib/vllm/test_vllm_llmobs.py b/tests/contrib/vllm/test_vllm_llmobs.py new file mode 100644 index 00000000000..4444cf0ac32 --- /dev/null +++ b/tests/contrib/vllm/test_vllm_llmobs.py @@ -0,0 +1,290 @@ +import mock +import pytest + +from tests.llmobs._utils import _expected_llmobs_llm_span_event + +from ._utils import get_simple_chat_template + + +IGNORE_FIELDS = [] + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_basic(llmobs_events, mock_tracer, opt_125m_llm): + from vllm import SamplingParams + + llm = opt_125m_llm + sampling = SamplingParams(temperature=0.1, top_p=0.9, max_tokens=8, seed=42) + llm.generate("The future of AI is", sampling) + span = mock_tracer.pop_traces()[0][0] + + assert len(llmobs_events) == 1 + expected = _expected_llmobs_llm_span_event( + span, + model_name="facebook/opt-125m", + model_provider="vllm", + input_messages=[{"content": "The future of AI is", "role": 
""}], + output_messages=[{"content": " in the hands of the people.", "role": ""}], + metadata={ + "max_tokens": 8, + "n": 1, + "temperature": 0.1, + "top_p": 0.9, + "finish_reason": "length", + "num_cached_tokens": 0, + }, + token_metrics={ + "input_tokens": 6, + "output_tokens": 8, + "total_tokens": 14, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_decode": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert llmobs_events[0] == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_chat(llmobs_events, mock_tracer, opt_125m_llm): + from vllm import SamplingParams + + llm = opt_125m_llm + sampling_params = SamplingParams(seed=42) + + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, + {"role": "user", "content": "Write an essay about the importance of higher education."}, + ] + + llm.chat(conversation, sampling_params, chat_template=get_simple_chat_template(), use_tqdm=False) + span = mock_tracer.pop_traces()[0][0] + + assert len(llmobs_events) == 1 + expected = _expected_llmobs_llm_span_event( + span, + model_name="facebook/opt-125m", + model_provider="vllm", + input_messages=[ + { + "content": ( + "You are a helpful assistant\nUser: Hello\nAssistant: Hello! How can I assist you today?\n" + "User: Write an essay about the importance of higher education.\nAssistant:" + ), + "role": "", + } + ], + output_messages=[ + { + "content": ( + " Provide lecture information about INTERESTED universities by translating people's " + "ideas into their" + ), + "role": "", + } + ], + metadata={ + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + "n": 1, + "finish_reason": "length", + "num_cached_tokens": mock.ANY, + }, + token_metrics={ + "input_tokens": 37, + "output_tokens": 16, + "total_tokens": 53, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_decode": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert llmobs_events[0] == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_classify(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.classify(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[1 embedding(s) returned with size 1]", + metadata={"embedding_dim": 1, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def 
test_llmobs_embed(llmobs_events, mock_tracer, e5_small_llm): + llm = e5_small_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.embed(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="intfloat/e5-small", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[1 embedding(s) returned with size 384]", + metadata={"embedding_dim": 384, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_reward(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.reward(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[7 embedding(s) returned with size 1024]", + metadata={"embedding_dim": 1024, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_score(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + text_1 = "What is the capital of France?" 
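+ # Candidate documents scored against text_1; llm.score() issues one request per
+ # (query, document) pair, so one LLMObs embedding event is expected per candidate below.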
+ texts_2 = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + llm.score(text_1, texts_2) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per candidate document + assert len(llmobs_events) == len(texts_2) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + expected_token_metrics_by_text = { + "[0, 4865, 83, 70, 10323, 111, 9942, 32, 2, 2, 581, 10323, 111, 30089, 83, 8233, 399, 5, 2]": { + "input_tokens": 19, + "output_tokens": 0, + "total_tokens": 19, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + "[0, 4865, 83, 70, 10323, 111, 9942, 32, 2, 2, 581, 10323, 111, 9942, 83, 7270, 5, 2]": { + "input_tokens": 18, + "output_tokens": 0, + "total_tokens": 18, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + } + + for event in llmobs_events: + span = span_by_id[int(event["span_id"])] + token_text = event["meta"]["input"]["documents"][0]["text"] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": token_text}], + output_value="[1 embedding(s) returned with size 1]", + metadata={"embedding_dim": 1, "num_cached_tokens": 0}, + token_metrics=expected_token_metrics_by_text[token_text], + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected diff --git a/tests/llmobs/suitespec.yml b/tests/llmobs/suitespec.yml index 57e752e1ea7..bfec7d70e04 100644 --- a/tests/llmobs/suitespec.yml +++ b/tests/llmobs/suitespec.yml @@ -28,6 +28,8 @@ components: - ddtrace/contrib/internal/openai_agents/* pydantic_ai: - ddtrace/contrib/internal/pydantic_ai/* + vllm: + - ddtrace/contrib/internal/vllm/* suites: anthropic: parallelism: 2 @@ -208,3 +210,18 @@ suites: - tests/snapshots/tests.contrib.pydantic_ai.* runner: riot snapshot: true + vllm: + parallelism: 1 + paths: + - '@bootstrap' + - '@core' + - '@tracing' + - '@contrib' + - '@vllm' + - '@llmobs' + - tests/contrib/vllm/* + - tests/snapshots/tests.contrib.vllm.* + runner: riot + gpu: true + snapshot: true + diff --git a/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json b/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json new file mode 100644 index 00000000000..a9a0ae8cfb5 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json @@ -0,0 +1,132 @@ +[[ + { + "name": "api.rag", + "service": "tests.contrib.vllm", + "resource": "api.rag", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139394845019214247463221322873009520251", + "_dd.p.tid": "68de787a00000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c" + }, + "metrics": { + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 34973862870, + "start": 1759410298980691304 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 2, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": 
"139394846366093010199711342595480305293", + "_dd.p.tid": "68de787a00000000", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "process_id": 5676 + }, + "duration": 328518565, + "start": 1759410315481061632 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 3, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139394846366093010206748569929833385056", + "_dd.p.tid": "68de787a00000000", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "process_id": 5676 + }, + "duration": 10421016, + "start": 1759410315810782464 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 4, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139394846366093010208733316920024243816", + "_dd.p.tid": "68de787a00000000", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "process_id": 5676 + }, + "duration": 9502380, + "start": 1759410315821967872 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 5, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139394847792199935459077289571935634966", + "_dd.p.tid": "68de787a00000000", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "process_id": 5676 + }, + "duration": 312517816, + "start": 1759410333296862208 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json new file mode 100644 index 00000000000..abe14e5f230 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json @@ -0,0 +1,28 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78bc00000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 118164202, + "start": 1759410364206987008 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json new file mode 100644 index 00000000000..8a2c288a7a3 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json @@ -0,0 
+1,28 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78c800000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 110508762, + "start": 1759410376640131328 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json new file mode 100644 index 00000000000..628a7fcd105 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json @@ -0,0 +1,56 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78c800000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 24355851, + "start": 1759410376376144128 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78c800000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 24859074, + "start": 1759410376376787200 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json new file mode 100644 index 00000000000..763b8166c4f --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json @@ -0,0 +1,56 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78c800000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "intfloat/e5-small", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 211406023, + "start": 1759410375903713024 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78c800000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "intfloat/e5-small", + "vllm.request.provider": "vllm" + }, + "metrics": { + 
"_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 218747341, + "start": 1759410375904525568 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json new file mode 100644 index 00000000000..9e2e9ed5ecb --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json @@ -0,0 +1,56 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78b300000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 203040754, + "start": 1759410355140246784 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78b300000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 214339503, + "start": 1759410355141096704 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json new file mode 100644 index 00000000000..c4f0438328d --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json @@ -0,0 +1,56 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78cb00000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 33020486, + "start": 1759410379465746944 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "68de78cb00000000", + "language": "python", + "runtime-id": "2d7546550d75477f926a4b9a8a91b35c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "process_id": 5676 + }, + "duration": 44709290, + "start": 1759410379466160384 + }]]