Skip to content

Commit cd80805

Browse files
[CI] Add GPU selector for runner (#148)
* Update run_tests.yml * Update ip * clear cache * fix test not run * fix no tests * Update run_tests.yml
1 parent 87a4644 commit cd80805

File tree

1 file changed

+39
-123
lines changed

1 file changed

+39
-123
lines changed

.github/workflows/run_tests.yml

Lines changed: 39 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ jobs:
2929
ref: ${{ github.event.inputs.ref }}
3030

3131
- name: Compile
32+
shell: bash
3233
run: python setup.py bdist_wheel
3334

3435
- name: Show dist folder
@@ -45,145 +46,60 @@ jobs:
4546
runs-on: self-hosted
4647
container:
4748
image: modelcloud/gptqmodel:github-ci-v1
49+
strategy:
50+
fail-fast: false
51+
matrix:
52+
version: [ "test_perplexity.py", "test_lm_head.py", "test_q4_exallama.py", "test_q4_exallama_v2.py", "test_q4_marlin.py", "test_q4_triton.py", "test_repacking.py", "test_serialization.py", "test_sharded.py", "test_triton.py", "test_quant_formats.py", "test_q4_cuda.py", "test_q4_bitblas.py" ]
53+
4854
steps:
55+
- name: Checkout Codes
56+
uses: actions/checkout@v4
57+
with:
58+
repository: ${{ github.event.inputs.repo }}
59+
ref: ${{ github.event.inputs.ref }}
60+
61+
- name: Show folder
62+
run: |
63+
ls -alh . || true
64+
ls -alh dist || true
65+
rm -rf dist/* || true
66+
4967
- name: Download artifact
5068
uses: actions/download-artifact@v4
5169
with:
5270
name: dist
5371
path: dist
5472

5573
- name: Show dist folder
56-
run: ls -alh dist
74+
run: ls -alh dist || true
5775

5876
- name: Install wheel
77+
shell: bash
5978
run: |
60-
# install only the last version
6179
pip install dist/*.whl
6280
6381
- name: Find suitable GPU
64-
run: |
65-
suitable_gpu=$(nvidia-smi -L | grep -E '4090' | awk -F': ' '{print $1}' | sed 's/GPU //g' | while read gpu_id
66-
do
67-
mem_total=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i $gpu_id)
68-
mem_used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $gpu_id)
69-
mem_used_pct=$((100 * mem_used / mem_total))
70-
if [ $mem_used_pct -lt 2 ]; then # 2 -> 98% free
71-
echo $gpu_id
72-
break
73-
fi
74-
done)
75-
if [ -z "$suitable_gpu" ]; then
76-
echo "No suitable GPU found. Exiting with error."
77-
exit 1
78-
else
79-
echo "CUDA_VISIBLE_DEVICES=$suitable_gpu" >> $GITHUB_ENV
80-
echo "CUDA_VISIBLE_DEVICES set to $suitable_gpu"
81-
fi
82-
83-
- name: Run test_perplexity.py
84-
id: test_perplexity
85-
continue-on-error: true
86-
run: pytest --durations=0 tests/test_perplexity.py
87-
88-
- name: Run test_lm_head.py
89-
id: test_lm_head
90-
continue-on-error: true
91-
run: pytest --durations=0 tests/test_lm_head.py
92-
93-
- name: Run test_q4_exallama.py
94-
id: test_q4_exallama
95-
continue-on-error: true
96-
run: pytest --durations=0 tests/test_q4_exallama.py
97-
98-
- name: Run test_q4_exallama_v2.py
99-
id: test_q4_exallama_v2
100-
continue-on-error: true
101-
run: pytest --durations=0 tests/test_q4_exallama_v2.py
102-
103-
- name: Run test_q4_marlin.py
104-
id: test_q4_marlin
105-
continue-on-error: true
106-
run: pytest --durations=0 tests/test_q4_marlin.py
107-
108-
- name: Run test_q4_triton.py
109-
id: test_q4_triton
110-
continue-on-error: true
111-
run: pytest --durations=0 tests/test_q4_triton.py
112-
113-
- name: Run test_repacking.py
114-
id: test_repacking
115-
continue-on-error: true
116-
run: pytest --durations=0 tests/test_repacking.py
117-
118-
- name: Run test_serialization.py
119-
id: test_serialization
120-
continue-on-error: true
121-
run: pytest --durations=0 tests/test_serialization.py
122-
123-
- name: Run test_sharded.py
124-
id: test_sharded
125-
continue-on-error: true
126-
run: pytest --durations=0 tests/test_sharded.py
127-
128-
- name: Run test_triton.py
129-
id: test_triton
130-
continue-on-error: true
131-
run: pytest --durations=0 tests/test_triton.py
132-
133-
- name: Run test_quant_formats.py
134-
id: test_quant_formats
135-
continue-on-error: true
136-
run: pytest --durations=0 tests/test_quant_formats.py
137-
138-
- name: Run test_q4_cuda.py
139-
id: test_q4_cuda
140-
continue-on-error: true
141-
run: pytest --durations=0 tests/test_q4_cuda.py
142-
143-
- name: Run test_q4_bitblas.py
144-
id: test_q4_bitblas
145-
continue-on-error: true
146-
run: pytest --durations=0 tests/test_q4_bitblas.py
147-
148-
- name: Print results
14982
shell: bash
15083
run: |
151-
declare -A step_outcomes
152-
step_outcomes=(
153-
[test_perplexity]="${{ steps.test_perplexity.outcome }}"
154-
[test_lm_head]="${{ steps.test_lm_head.outcome }}"
155-
[test_q4_exallama]="${{ steps.test_q4_exallama.outcome }}"
156-
[test_q4_exallama_v2]="${{ steps.test_q4_exallama_v2.outcome }}"
157-
[test_q4_marlin]="${{ steps.test_q4_marlin.outcome }}"
158-
[test_q4_triton]="${{ steps.test_q4_triton.outcome }}"
159-
[test_repacking]="${{ steps.test_repacking.outcome }}"
160-
[test_serialization]="${{ steps.test_serialization.outcome }}"
161-
[test_sharded]="${{ steps.test_sharded.outcome }}"
162-
[test_triton]="${{ steps.test_triton.outcome }}"
163-
[test_quant_formats]="${{ steps.test_quant_formats.outcome }}"
164-
[test_q4_cuda]="${{ steps.test_q4_cuda.outcome }}"
165-
[test_q4_bitblas]="${{ steps.test_q4_bitblas.outcome }}"
166-
)
167-
168-
max_length=0
169-
for step in "${!step_outcomes[@]}"; do
170-
length=${#step}
171-
if [[ $length -gt $max_length ]]; then
172-
max_length=$length
173-
fi
174-
done
175-
176-
error_occurred=0
177-
for step in "${!step_outcomes[@]}"; do
178-
outcome="${step_outcomes[$step]}"
179-
if [ "$outcome" == "success" ]; then
180-
printf "\e[32m%-*s Result: %s\e[0m\n" $((max_length + 4)) "$step" "$outcome"
84+
gpu_id=-1
85+
86+
while [ "$gpu_id" -lt 0 ]; do
87+
gpu_id=$(curl -sf "http://10.0.23.237/gpu/get?id=${{ github.run_id }}" || true)
# A failed request or a non-integer reply means "no GPU yet": fall back to -1
# so the numeric -lt tests stay valid and the loop retries instead of
# exporting an empty CUDA_VISIBLE_DEVICES.
[[ "$gpu_id" =~ ^-?[0-9]+$ ]] || gpu_id=-1
88+
89+
if [ "$gpu_id" -lt 0 ]; then
90+
echo "No available GPU, waiting 5 seconds..."
91+
sleep 5
18192
else
182-
printf "\e[31m%-*s Result: %s\e[0m\n" $((max_length + 4)) "$step" "$outcome"
183-
error_occurred=1
93+
echo "Allocated GPU ID: $gpu_id"
18494
fi
18595
done
186-
187-
if [ $error_occurred -eq 1 ]; then
188-
exit 1
189-
fi
96+
echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
97+
echo "CUDA_VISIBLE_DEVICES set to $gpu_id"
98+
99+
- name: Run tests
100+
# The matrix key holding the test file name is `version`; an undefined key
# expands to an empty string and would make every job run the whole tests/ dir.
run: pytest tests/${{ matrix.version }}
101+
102+
- name: Release GPU
103+
if: always()
104+
shell: bash
105+
run: curl -X GET "http://10.0.23.237/gpu/release?id=${{ github.run_id }}&gpu=$CUDA_VISIBLE_DEVICES"

0 commit comments

Comments
 (0)