Use new DeviceMesh unflatten to rewrite parallel_dims #1808

name: 8 GPU Feature Tests

on:
  push:
    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  pull_request:
    paths-ignore:
      - 'torchtitan/experiments/**'
  schedule:
    # Runs every 6 hours
    - cron: '0 */6 * * *'
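
# On pushes to main, the group key includes github.run_number, so every run is
# unique and nothing is cancelled; on pull requests it falls back to
# github.ref, so a newer push to the same branch cancels the in-flight run.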
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
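
# Every run step uses a login shell (-l) that exits on the first error (-e)
# and fails a pipeline if any stage of it fails (-o pipefail).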
defaults:
  run:
    shell: bash -l -eo pipefail {0}
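
# id-token: write lets the job request a GitHub OIDC token, which the reusable
# linux_job_v2 workflow presumably uses for cloud authentication.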
permissions:
  id-token: write
  contents: read

jobs:
  # Step 1: Dynamically compute the matrix based on conditions
  set-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set.outputs.matrix }}
    steps:
      - id: set
        run: |
          # Decide which matrix entries to include based on the event type
          if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
            # Include both CUDA and ROCm
            echo '{"include":[
              {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
              {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
            ]}' > matrix.json
          else
            # Include only CUDA
            echo '{"include":[
              {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
            ]}' > matrix.json
          fi
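          # A multi-line value can't be exported as a plain key=value line, so
          # use the heredoc-style output syntax: matrix<<EOF ... EOF.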
          # Export the matrix to the job outputs
          {
            echo 'matrix<<EOF'
            cat matrix.json
            echo 'EOF'
          } >> "$GITHUB_OUTPUT"

  # Step 2: Use the dynamic matrix in the build-test job
  build-test:
    needs: set-matrix
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    strategy:
      fail-fast: false
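      # Job outputs are strings, so fromJSON is needed to turn the matrix back
      # into a structured object.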
      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
    with:
      runner: ${{ matrix.runner }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      docker-image: ${{ matrix.docker-image }}
      repository: pytorch/torchtitan
      upload-artifact: outputs
      timeout: 45
      script: |
        set -eux
        # The generic Linux job uses the base conda env, not the one set up by the image.
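        # Pick the last env from `conda env list --json`, i.e. the image's env.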
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        # Log the CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"
        pip config --user set global.progress_bar off
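        # Install the matching nightly torch wheels; USE_CPP=0 skips building
        # torchao's C++/CUDA extensions.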
        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
        sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
        sudo chown -R "$(id -u):$(id -g)" "$RUNNER_TEMP/artifacts-to-be-uploaded"
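        # With 8 GPUs, replicate degree 1 is pure FSDP (8 shards) while
        # replicate degree 4 is HSDP (4 replicas x 2 shards); both should
        # produce identical losses.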
        # Verify the accuracy first.
        echo "Checking FSDP (8) vs. HSDP (4, 2) accuracy parity"
        export baseline_options="--parallelism.data_parallel_replicate_degree=1"
        export test_options="--parallelism.data_parallel_replicate_degree=4"
        python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --import-result tests/assets/losses/llama3.txt
        rm -rf "$RUNNER_TEMP"/artifacts-to-be-uploaded/*
        python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
        # Clean up the checkpoints so that we don't waste network bandwidth and time.
        rm -rf "$RUNNER_TEMP"/artifacts-to-be-uploaded/*/checkpoint
        rm -rf artifacts-to-be-uploaded/*/checkpoint