From 3ac21b04c4abdcb16d4a53a93b529bcf513d2b9a Mon Sep 17 00:00:00 2001
From: James Tayler
Date: Mon, 20 Nov 2023 22:50:34 +1300
Subject: [PATCH 1/3] run tests in docker with cuda execution provider

---
 Dockerfile                                    | 38 +++++++++++++++++--
 .../OnnxStack.IntegrationTests.csproj         |  2 +-
 OnnxStack.IntegrationTests/appsettings.json   |  4 +-
 OnnxStackCore.sln                             |  1 +
 docker-compose.yml                            |  7 ++++
 run-docker-tests.sh                           |  2 ++
 6 files changed, 48 insertions(+), 6 deletions(-)
 create mode 100755 run-docker-tests.sh

diff --git a/Dockerfile b/Dockerfile
index 1f5376dc..232cbd5a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,10 @@
-FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build
+# Since we're using the nvidia/cuda base image, this requires the nvidia-container-toolkit to be installed on the host system to pass the drivers through to the container.
+# See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+FROM nvidia/cuda:12.3.0-runtime-ubuntu22.04 AS final
 WORKDIR /app

 # Install Git and Git LFS
-RUN apt-get update && apt-get install -y curl
+RUN apt-get update && apt-get install -y curl wget
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && apt-get install -y git-lfs

 # Clone the Stable Diffusion 1.5 base model
@@ -11,7 +13,37 @@ RUN git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 -b onnx
 # Clone the LCM Dreamshaper V7 model
 RUN git clone https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX

+# Need to install NVIDIA's GPG key before `apt search` will show up-to-date packages for CUDA.
+RUN wget -N -t 5 -T 10 http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
+    && dpkg -i ./cuda-keyring_1.1-1_all.deb
+
+# Install the CUDA dependencies required according to `ldd libonnxruntime_providers_cuda.so`.
+RUN apt-get update \
+    && apt-get install -y libcublaslt11 libcublas11 libcudnn8=8.9.1.23-1+cuda11.8 libcufft10 libcudart11.0
+
+# According to `ldd libortextensions.so` it depends on SSL 1.1 to run, and the dotnet/runtime-deps base image installs it, which is why it works inside the dotnet base images.
+# Since we need access to the GPU to use the CUDA execution provider, we need to use the nvidia/cuda base image instead.
+# The nvidia/cuda base image doesn't contain SSL 1.1, hence we have to manually install it like this to satisfy the dependency.
+# This fixes the "The ONNX Runtime extensions library was not found" error.
+# See: https://stackoverflow.com/questions/72133316/libssl-so-1-1-cannot-open-shared-object-file-no-such-file-or-directory
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2.20_amd64.deb && dpkg -i libssl1.1_1.1.1f-1ubuntu2.20_amd64.deb
+
+# Need to install the dotnet sdk ourselves since we're not using the dotnet/sdk base image.
+# Note: icu is also installed to help with globalization https://learn.microsoft.com/en-us/dotnet/core/extensions/globalization-icu
+RUN apt-get update \
+    && apt-get install -y dotnet-sdk-7.0 icu-devtools
+
+ENV \
+    # Enable detection of running in a container
+    DOTNET_RUNNING_IN_CONTAINER=true \
+    # Do not generate certificate
+    DOTNET_GENERATE_ASPNET_CERTIFICATE=false \
+    # Do not show first run text
+    DOTNET_NOLOGO=true \
+    # Skip extraction of XML docs - generally not useful within an image/container - helps performance
+    NUGET_XMLDOC_MODE=skip
+
 COPY . .
 RUN dotnet build OnnxStackCore.sln

-ENTRYPOINT ["dotnet", "test", "OnnxStackCore.sln"]
\ No newline at end of file
+ENTRYPOINT ["sh", "-c", "nvidia-smi && dotnet test OnnxStackCore.sln"]
\ No newline at end of file
diff --git a/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj b/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
index 0d8a711a..cdd5a81d 100644
--- a/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
+++ b/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
@@ -15,8 +15,8 @@
-
+
diff --git a/OnnxStack.IntegrationTests/appsettings.json b/OnnxStack.IntegrationTests/appsettings.json
index f12635cf..3ba41c45 100644
--- a/OnnxStack.IntegrationTests/appsettings.json
+++ b/OnnxStack.IntegrationTests/appsettings.json
@@ -24,7 +24,7 @@
       "InterOpNumThreads": 0,
       "IntraOpNumThreads": 0,
       "ExecutionMode": "ORT_SEQUENTIAL",
-      "ExecutionProvider": "Cpu",
+      "ExecutionProvider": "Cuda",
       "ModelConfigurations": [
         {
           "Type": "Tokenizer",
@@ -65,7 +65,7 @@
       "InterOpNumThreads": 0,
       "IntraOpNumThreads": 0,
       "ExecutionMode": "ORT_SEQUENTIAL",
-      "ExecutionProvider": "Cpu",
+      "ExecutionProvider": "Cuda",
       "ModelConfigurations": [
         {
           "Type": "Tokenizer",
diff --git a/OnnxStackCore.sln b/OnnxStackCore.sln
index 810ecdef..775ebc70 100644
--- a/OnnxStackCore.sln
+++ b/OnnxStackCore.sln
@@ -13,6 +13,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SolutionItems", "SolutionIt
 		.gitignore = .gitignore
 		docker-compose.yml = docker-compose.yml
 		README.md = README.md
+		run-docker-tests.sh = run-docker-tests.sh
 	EndProjectSection
 EndProject
 Global
diff --git a/docker-compose.yml b/docker-compose.yml
index 2f93a27f..e02e227a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,5 +3,12 @@ version: '3.7'
 services:
   app:
     build: .
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
     volumes:
       - "./docker-test-output:/app/OnnxStack.IntegrationTests/bin/Debug/net7.0/images"
\ No newline at end of file
diff --git a/run-docker-tests.sh b/run-docker-tests.sh
new file mode 100755
index 00000000..c4c82b86
--- /dev/null
+++ b/run-docker-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker-compose up --build
\ No newline at end of file

From b1d23584152b02a63fd889aa22c0f6d46ab8b4ff Mon Sep 17 00:00:00 2001
From: James Tayler
Date: Mon, 20 Nov 2023 23:25:34 +1300
Subject: [PATCH 2/3] ensure tests run sequentially

---
 OnnxStack.IntegrationTests/IntegrationTestCollection.cs | 7 +++++++
 OnnxStack.IntegrationTests/StableDiffusionTests.cs      | 8 ++------
 OnnxStack.IntegrationTests/Usings.cs                    | 5 ++++-
 OnnxStackCore.sln                                       | 2 +-
 run-docker-tests.sh                                     | 2 --
 run-integration-tests-cuda.sh                           | 7 +++++++
 6 files changed, 21 insertions(+), 10 deletions(-)
 create mode 100644 OnnxStack.IntegrationTests/IntegrationTestCollection.cs
 delete mode 100755 run-docker-tests.sh
 create mode 100755 run-integration-tests-cuda.sh

diff --git a/OnnxStack.IntegrationTests/IntegrationTestCollection.cs b/OnnxStack.IntegrationTests/IntegrationTestCollection.cs
new file mode 100644
index 00000000..ccd0d7bb
--- /dev/null
+++ b/OnnxStack.IntegrationTests/IntegrationTestCollection.cs
@@ -0,0 +1,7 @@
+namespace OnnxStack.IntegrationTests;
+
+/// <summary>
+/// All integration tests need to go in a single collection, so tests in different classes run sequentially and not in parallel.
+/// </summary>
+[CollectionDefinition("IntegrationTests")]
+public class IntegrationTestCollection { }
\ No newline at end of file
diff --git a/OnnxStack.IntegrationTests/StableDiffusionTests.cs b/OnnxStack.IntegrationTests/StableDiffusionTests.cs
index 161c7ed9..73d0d9fd 100644
--- a/OnnxStack.IntegrationTests/StableDiffusionTests.cs
+++ b/OnnxStack.IntegrationTests/StableDiffusionTests.cs
@@ -1,4 +1,3 @@
-using System.Security.Cryptography;
 using FluentAssertions;
 using FluentAssertions.Execution;
 using Microsoft.Extensions.DependencyInjection;
@@ -13,13 +12,10 @@ namespace OnnxStack.IntegrationTests;

 /// <summary>
-/// These tests just run on CPU execution provider for now, but could switch it to CUDA and run on GPU
-/// if the necessary work is done to setup the docker container to allow GPU passthrough to the container.
-/// See https://blog.roboflow.com/use-the-gpu-in-docker/ for an example of how to do this.
-///
-/// Can then also setup a self-hosted runner in Github Actions to run the tests on your own GPU as part of the CI/CD pipeline.
+/// These tests could be run via a self-hosted runner in GitHub Actions to run on your own GPU as part of the CI/CD pipeline.
 /// Maybe something like https://www.youtube.com/watch?v=rVq-SCNyxVc
 /// </summary>
+[Collection("IntegrationTests")]
 public class StableDiffusionTests
 {
     private readonly IStableDiffusionService _stableDiffusion;
diff --git a/OnnxStack.IntegrationTests/Usings.cs b/OnnxStack.IntegrationTests/Usings.cs
index 8c927eb7..9af5aa1a 100644
--- a/OnnxStack.IntegrationTests/Usings.cs
+++ b/OnnxStack.IntegrationTests/Usings.cs
@@ -1 +1,4 @@
-global using Xunit;
\ No newline at end of file
+global using Xunit;
+
+// Need all tests to run sequentially, one at a time, so they don't overwhelm the GPU.
+[assembly: CollectionBehavior(DisableTestParallelization = true)]
\ No newline at end of file
diff --git a/OnnxStackCore.sln b/OnnxStackCore.sln
index 775ebc70..f691cc93 100644
--- a/OnnxStackCore.sln
+++ b/OnnxStackCore.sln
@@ -13,7 +13,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SolutionItems", "SolutionIt
 		.gitignore = .gitignore
 		docker-compose.yml = docker-compose.yml
 		README.md = README.md
-		run-docker-tests.sh = run-docker-tests.sh
+		run-integration-tests-cuda.sh = run-integration-tests-cuda.sh
 	EndProjectSection
 EndProject
 Global
diff --git a/run-docker-tests.sh b/run-docker-tests.sh
deleted file mode 100755
index c4c82b86..00000000
--- a/run-docker-tests.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-docker-compose up --build
\ No newline at end of file
diff --git a/run-integration-tests-cuda.sh b/run-integration-tests-cuda.sh
new file mode 100755
index 00000000..9cc0a5e1
--- /dev/null
+++ b/run-integration-tests-cuda.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Running this requires:
+# - an NVIDIA GPU with sufficient VRAM
+# - NVIDIA drivers installed on the host system
+# - nvidia-container-toolkit installed on the host system (see: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+# Note: nvidia-smi reports peak VRAM usage close to 24GB while the tests run.
+docker-compose up --build
\ No newline at end of file

From e2a82e13b1c5203cfe235feb3110894b5f11f87f Mon Sep 17 00:00:00 2001
From: James Tayler
Date: Tue, 21 Nov 2023 00:19:23 +1300
Subject: [PATCH 3/3] add troubleshooting instructions and remove unused dependency

---
 .../OnnxStack.IntegrationTests.csproj |  2 +-
 README.md                             | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj b/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
index cdd5a81d..250812fb 100644
--- a/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
+++ b/OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj
@@ -15,7 +15,7 @@
-
+
diff --git a/README.md b/README.md
index c0556e24..30813e74 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,21 @@ Other `Microsoft.ML.OnnxRuntime.*` executors like `Cuda` may work but are untested.
 `DirectML` > 10GB VRAM

-
+## Troubleshooting
+
+ - I'm running on Linux but it's not working, citing: `The ONNX Runtime extensions library was not found`?
+   - It's having a problem loading `libortextensions.so`.
+   - From the project root, run `find -name "libortextensions.so"` to locate that file.
+   - Then run `ldd libortextensions.so` against it to see what dependencies it needs versus what your system has.
+   - It has a dependency on SSL 1.1, which was removed from Ubuntu-based OSes and causes this error.
+   - It can be remedied by manually installing the missing dependency.
+   - See: https://stackoverflow.com/questions/72133316/libssl-so-1-1-cannot-open-shared-object-file-no-such-file-or-directory
+ - I've installed `Microsoft.ML.OnnxRuntime` and `Microsoft.ML.OnnxRuntime.Gpu` into my project and set the execution provider to `Cuda`, but it's complaining it can't find an entry point for CUDA?
+   - `System.EntryPointNotFoundException : Unable to find an entry point named 'OrtSessionOptionsAppendExecutionProvider_CUDA' in shared library 'onnxruntime'`
+   - Referencing both `Microsoft.ML.OnnxRuntime` AND `Microsoft.ML.OnnxRuntime.Gpu` at the same time causes this.
+   - Remove `Microsoft.ML.OnnxRuntime` and try again.
+ - I'm trying to run via the CUDA execution provider but it's complaining about missing `libcublaslt11`, `libcublas11`, or `libcudnn8`?
+   - Aside from the NVIDIA drivers, you also need to install CUDA and cuDNN.

 ## Contribution
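
Note: before building the image, GPU passthrough can be sanity-checked on the host with a throwaway container. This is a minimal check under the same assumptions the scripts above make (NVIDIA drivers plus nvidia-container-toolkit installed); the image tag mirrors the Dockerfile's base image and can be swapped for any CUDA image already pulled:

```bash
# If the nvidia-container-toolkit is wired up correctly, nvidia-smi inside
# the container lists the same GPU(s) as it does on the host.
docker run --rm --gpus all nvidia/cuda:12.3.0-runtime-ubuntu22.04 nvidia-smi
```

If this errors out on the `--gpus` flag or reports no devices, the `deploy.resources.reservations.devices` section in docker-compose.yml will likely fail the same way.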
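The troubleshooting entries lean on `ldd`; this is a sketch of that diagnosis loop, assuming the solution has been built so the native libraries exist under a `bin/` directory:

```bash
# Locate the ONNX Runtime extensions native library and show which of its
# shared-library dependencies the loader cannot resolve (e.g. libssl.so.1.1).
lib="$(find . -name libortextensions.so | head -n 1)"
ldd "$lib" | grep "not found"

# Check whether the CUDA libraries the provider needs are visible to the
# loader cache (libcublas, libcudnn, libcudart, libcufft).
ldconfig -p | grep -E "libcublas|libcudnn|libcudart|libcufft"
```

An empty `grep "not found"` result means the library's dependencies are satisfied and the error lies elsewhere.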
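For the `OrtSessionOptionsAppendExecutionProvider_CUDA` entry point error, the package removal described above can also be done from the CLI; a sketch, assuming the duplicate reference lives in the integration test project:

```bash
# Dropping the CPU-only package leaves Microsoft.ML.OnnxRuntime.Gpu as the
# only native ONNX Runtime binding, which restores the CUDA entry point.
dotnet remove OnnxStack.IntegrationTests/OnnxStack.IntegrationTests.csproj package Microsoft.ML.OnnxRuntime
```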