From b7a39106a911a03d3ea4e57e710b6945f14d8bff Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Wed, 22 Oct 2025 21:57:56 +0000 Subject: [PATCH 1/6] update the Makefile with new recipes --- Makefile | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Makefile b/Makefile index f2c5dd7..29cacd2 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ CUDA_117_PREFIX := $(REGISTRY_REPO):cuda-11.7- CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8- CUDA_121_PREFIX := $(REGISTRY_REPO):$(UBUNTU_VERSION)-cuda-12.1- CUDA_124_PREFIX := $(REGISTRY_REPO):$(UBUNTU_VERSION)-cuda-12.4- +CUDA_129_PREFIX := $(REGISTRY_REPO):$(UBUNTU_VERSION)-cuda-12.9- ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6- CPU_SUFFIX := -cpu @@ -87,6 +88,7 @@ export GPU_CUDA_117_BASE_NAME := $(CUDA_117_PREFIX)base$(GPU_SUFFIX) export GPU_CUDA_118_BASE_NAME := $(CUDA_118_PREFIX)base$(GPU_SUFFIX) export GPU_CUDA_121_BASE_NAME := $(CUDA_121_PREFIX)base$(GPU_SUFFIX) export GPU_CUDA_124_BASE_NAME := $(CUDA_124_PREFIX)base$(GPU_SUFFIX) +export GPU_CUDA_129_BASE_NAME := $(CUDA_129_PREFIX)base$(GPU_SUFFIX) # Timeout used by packer for AWS operations. Default is 120 (30 minutes) for # waiting for AMI availablity. Bump to 360 attempts = 90 minutes. @@ -252,6 +254,18 @@ build-gpu-cuda-124-base: -o type=image,push=false \ . +.PHONY: build-gpu-cuda-129-base +build-gpu-cuda-129-base: + docker build -f Dockerfile-base-gpu \ + --build-arg BASE_IMAGE="nvidia/cuda:12.9.1-cudnn-devel-$(UBUNTU_VERSION)" \ + --build-arg PYTHON_VERSION="$(PYTHON_VERSION_39)" \ + --build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \ + --build-arg "$(MPI_BUILD_ARG)" \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(VERSION) \ + -o type=image,push=false \ + . + export CPU_TF1_ENVIRONMENT_NAME := $(CPU_PREFIX_37)pytorch-1.7-tf-1.15$(CPU_SUFFIX) export GPU_TF1_ENVIRONMENT_NAME := $(CUDA_102_PREFIX)pytorch-1.7-tf-1.15$(GPU_SUFFIX) @@ -334,10 +348,12 @@ export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_117_PREFIX)$(PY_39_TAG) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201 := $(CUDA_118_PREFIX)$(PY_39_TAG)pytorch-2.0.1-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210 := $(CUDA_121_PREFIX)$(PY_39_TAG)pytorch-2.1.0-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_240 := $(CUDA_124_PREFIX)$(PY_39_TAG)pytorch-2.4.0-gpt-neox-deepspeed$(GPU_SUFFIX) +export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290 := $(CUDA_129_PREFIX)$(PY_39_TAG)pytorch-2.9.0-gpt-neox-deepspeed$(GPU_SUFFIX) export TORCH_PIP_DEEPSPEED_GPU := torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/cu117/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_201 := torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_210 := torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_240 := torch==2.4.0+cu124 torchvision==0.19.0+cu124 torchaudio==2.4.0+cu124 --index-url https://download.pytorch.org/whl/cu124 +export TORCH_PIP_DEEPSPEED_GPU_290 := torch==2.9.0+cu129 torchvision==0.24.0+cu129 torchaudio==2.9.0+cu129 --index-url https://download.pytorch.org/whl/cu129 export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1 # This builds deepspeed environment off of upstream microsoft/DeepSpeed. @@ -373,6 +389,9 @@ augment-torch-210: build-gpt-neox-deepspeed-gpu-torch-210 .PHONY: augment-torch-240 augment-torch-240: build-gpt-neox-deepspeed-gpu-torch-240 +.PHONY: augment-torch-290 +augment-torch-290: build-gpt-neox-deepspeed-gpu-torch-290 + # This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed # that we need for gpt-neox support. .PHONY: build-gpt-neox-deepspeed-gpu @@ -445,6 +464,20 @@ build-gpt-neox-deepspeed-gpu-torch-240: build-gpu-cuda-124-base -o type=image,push=false \ . +.PHONY: build-gpt-neox-deepspeed-gpu-torch-290 +build-gpt-neox-deepspeed-gpu-torch-290: build-gpu-cuda-129-base + docker build -f Dockerfile-default-gpu \ + --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(SHORT_GIT_HASH)" \ + --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_290)" \ + --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ + --build-arg TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0" \ + --build-arg DET_BUILD_NCCL="" \ + --build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@d08ec4e806ace0721026dd83067ca43ddc697e15" \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290)-$(VERSION) \ + -o type=image,push=false \ + . + ifeq ($(NGC_PUBLISH),) define CPU_TF28_TAGS -t $(DOCKERHUB_REGISTRY)/$(CPU_TF28_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ From c1c93c47cdb8a0fd6f9b86b5872d0decb5fba360 Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Thu, 23 Oct 2025 17:29:31 +0000 Subject: [PATCH 2/6] fix the pytorch version to 2.8.0 --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 29cacd2..a187a67 100644 --- a/Makefile +++ b/Makefile @@ -348,12 +348,12 @@ export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_117_PREFIX)$(PY_39_TAG) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201 := $(CUDA_118_PREFIX)$(PY_39_TAG)pytorch-2.0.1-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210 := $(CUDA_121_PREFIX)$(PY_39_TAG)pytorch-2.1.0-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_240 := $(CUDA_124_PREFIX)$(PY_39_TAG)pytorch-2.4.0-gpt-neox-deepspeed$(GPU_SUFFIX) -export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290 := $(CUDA_129_PREFIX)$(PY_39_TAG)pytorch-2.9.0-gpt-neox-deepspeed$(GPU_SUFFIX) +export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280 := $(CUDA_129_PREFIX)$(PY_39_TAG)pytorch-2.8.0-gpt-neox-deepspeed$(GPU_SUFFIX) export TORCH_PIP_DEEPSPEED_GPU := torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/cu117/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_201 := torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_210 := torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_240 := torch==2.4.0+cu124 torchvision==0.19.0+cu124 torchaudio==2.4.0+cu124 --index-url https://download.pytorch.org/whl/cu124 -export TORCH_PIP_DEEPSPEED_GPU_290 := torch==2.9.0+cu129 torchvision==0.24.0+cu129 torchaudio==2.9.0+cu129 --index-url https://download.pytorch.org/whl/cu129 +export TORCH_PIP_DEEPSPEED_GPU_280 := torch==2.8.0+cu129 torchvision==0.24.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129 export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1 # This builds deepspeed environment off of upstream microsoft/DeepSpeed. @@ -389,8 +389,8 @@ augment-torch-210: build-gpt-neox-deepspeed-gpu-torch-210 .PHONY: augment-torch-240 augment-torch-240: build-gpt-neox-deepspeed-gpu-torch-240 -.PHONY: augment-torch-290 -augment-torch-290: build-gpt-neox-deepspeed-gpu-torch-290 +.PHONY: augment-torch-280 +augment-torch-280: build-gpt-neox-deepspeed-gpu-torch-280 # This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed # that we need for gpt-neox support. @@ -464,17 +464,17 @@ build-gpt-neox-deepspeed-gpu-torch-240: build-gpu-cuda-124-base -o type=image,push=false \ . -.PHONY: build-gpt-neox-deepspeed-gpu-torch-290 -build-gpt-neox-deepspeed-gpu-torch-290: build-gpu-cuda-129-base +.PHONY: build-gpt-neox-deepspeed-gpu-torch-280 +build-gpt-neox-deepspeed-gpu-torch-280: build-gpu-cuda-129-base docker build -f Dockerfile-default-gpu \ --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(SHORT_GIT_HASH)" \ - --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_290)" \ + --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_280)" \ --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ --build-arg TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0" \ --build-arg DET_BUILD_NCCL="" \ --build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@d08ec4e806ace0721026dd83067ca43ddc697e15" \ - -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290)-$(SHORT_GIT_HASH) \ - -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_290)-$(VERSION) \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280)-$(VERSION) \ -o type=image,push=false \ . From 825bb6d0bb631480bb0c9e6644f7d9342dfc0e5d Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Thu, 23 Oct 2025 17:45:38 +0000 Subject: [PATCH 3/6] builds up until deepspeed install --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a187a67..8a32164 100644 --- a/Makefile +++ b/Makefile @@ -353,7 +353,7 @@ export TORCH_PIP_DEEPSPEED_GPU := torch==1.13.1+cu117 torchvision==0.14.1+cu117 export TORCH_PIP_DEEPSPEED_GPU_201 := torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_210 := torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_240 := torch==2.4.0+cu124 torchvision==0.19.0+cu124 torchaudio==2.4.0+cu124 --index-url https://download.pytorch.org/whl/cu124 -export TORCH_PIP_DEEPSPEED_GPU_280 := torch==2.8.0+cu129 torchvision==0.24.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129 +export TORCH_PIP_DEEPSPEED_GPU_280 := torch==2.8.0+cu129 torchvision==0.23.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129 export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1 # This builds deepspeed environment off of upstream microsoft/DeepSpeed. From e65f18fe00fceecc412d46b576f58f0ab84aa37d Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Thu, 23 Oct 2025 17:55:33 +0000 Subject: [PATCH 4/6] build completes --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8a32164..bb418a4 100644 --- a/Makefile +++ b/Makefile @@ -470,7 +470,7 @@ build-gpt-neox-deepspeed-gpu-torch-280: build-gpu-cuda-129-base --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(SHORT_GIT_HASH)" \ --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_280)" \ --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ - --build-arg TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0" \ + --build-arg TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0" \ --build-arg DET_BUILD_NCCL="" \ --build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@d08ec4e806ace0721026dd83067ca43ddc697e15" \ -t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280)-$(SHORT_GIT_HASH) \ From dc3a1dabb0ba3c7fb8d514ff63e7f10638f4e934 Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Thu, 23 Oct 2025 21:46:04 +0000 Subject: [PATCH 5/6] upgrade to python 3.11.7 --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bb418a4..2199587 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ PYTHON_VERSION_37 := 3.7.11 PYTHON_VERSION_38 := 3.8.12 PYTHON_VERSION_39 := 3.9.16 PYTHON_VERSION_310 := 3.10.12 +PYTHON_VERSION_311 := 3.11.7 PY_37_TAG := py-3.7- PY_38_TAG := py-3.8- PY_39_TAG := py-3.9- @@ -258,7 +259,7 @@ build-gpu-cuda-124-base: build-gpu-cuda-129-base: docker build -f Dockerfile-base-gpu \ --build-arg BASE_IMAGE="nvidia/cuda:12.9.1-cudnn-devel-$(UBUNTU_VERSION)" \ - --build-arg PYTHON_VERSION="$(PYTHON_VERSION_39)" \ + --build-arg PYTHON_VERSION="$(PYTHON_VERSION_311)" \ --build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \ --build-arg "$(MPI_BUILD_ARG)" \ -t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_129_BASE_NAME)-$(SHORT_GIT_HASH) \ From 102e040152ba0b914822136846d3a6c0cea0cdf1 Mon Sep 17 00:00:00 2001 From: Lawrence Fu Date: Thu, 30 Oct 2025 23:32:48 +0000 Subject: [PATCH 6/6] fix tag --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f2645a8..5ce465c 100644 --- a/Makefile +++ b/Makefile @@ -369,7 +369,7 @@ export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201 := $(CUDA_118_PREFIX)$(PY_39_ export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210 := $(CUDA_121_PREFIX)$(PY_39_TAG)pytorch-2.1.0-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_240 := $(CUDA_124_PREFIX)$(PY_39_TAG)pytorch-2.4.0-gpt-neox-deepspeed$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_270 := $(CUDA_128_PREFIX)$(PY_311_TAG)pytorch-2.7.0-gpt-neox-deepspeed$(GPU_SUFFIX) -export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280 := $(CUDA_129_PREFIX)$(PY_39_TAG)pytorch-2.8.0-gpt-neox-deepspeed$(GPU_SUFFIX) +export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_280 := $(CUDA_129_PREFIX)$(PY_311_TAG)pytorch-2.8.0-gpt-neox-deepspeed$(GPU_SUFFIX) export TORCH_PIP_DEEPSPEED_GPU := torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/cu117/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_201 := torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html export TORCH_PIP_DEEPSPEED_GPU_210 := torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html