Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/auto-retry-unit-tests' into 'main'
Browse files Browse the repository at this point in the history
ci(feat): Auto-retry unit tests

See merge request ADLR/megatron-lm!1642
  • Loading branch information
terrykong committed Jul 1, 2024
2 parents 6cfc330 + d9a9ca0 commit c7d53da
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 25 deletions.
48 changes: 34 additions & 14 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ workflow:

stages:
- build
- test
- jet
- unit_tests
- functional_tests

variables:
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
Expand Down Expand Up @@ -112,7 +112,7 @@ unit_tests:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
Expand All @@ -124,12 +124,14 @@ unit_tests:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
interruptible: true
retry:
max: 2

unit_tests-data:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data
rules:
Expand All @@ -139,12 +141,14 @@ unit_tests-data:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-dist-checkpointing:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing
rules:
Expand All @@ -154,12 +158,14 @@ unit_tests-dist-checkpointing:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-fusions:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions
rules:
Expand All @@ -169,12 +175,14 @@ unit_tests-fusions:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-inference:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference
rules:
Expand All @@ -184,12 +192,14 @@ unit_tests-inference:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-models:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models
rules:
Expand All @@ -199,12 +209,14 @@ unit_tests-models:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-pipeline-parallel:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel
rules:
Expand All @@ -214,12 +226,14 @@ unit_tests-pipeline-parallel:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-tensor-parallel:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel
rules:
Expand All @@ -229,12 +243,14 @@ unit_tests-tensor-parallel:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-transformer:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer
rules:
Expand All @@ -244,12 +260,14 @@ unit_tests-transformer:
when: never
- when: always
interruptible: true
retry:
max: 2

unit_tests-top-py:
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
tags:
- 8xL40S
stage: test
stage: unit_tests
script:
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py
rules:
Expand All @@ -259,10 +277,12 @@ unit_tests-top-py:
when: never
- when: always
interruptible: true
retry:
max: 2

docs_build_test:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
stage: test
stage: unit_tests
tags:
- os/linux
script:
Expand All @@ -280,7 +300,7 @@ formatting:
image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
tags:
- os/linux
stage: test
stage: unit_tests
before_script:
- git fetch origin main
script:
Expand Down
26 changes: 15 additions & 11 deletions jet-tests.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.jet_common:
stage: jet
stage: functional_tests
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/'
Expand All @@ -17,8 +17,8 @@ include:
file: downstreams.yml

jet-setup:
extends: [ .jet_common ]
tags:
extends: [.jet_common]
tags:
- os/linux
script:
- set -x
Expand All @@ -28,6 +28,8 @@ jet-setup:
reports:
dotenv: config.env
interruptible: true
retry:
max: 2

jet-configure:
image:
Expand All @@ -51,16 +53,17 @@ jet-configure:
| .spec.source.image = env(IMAGE)
)
' -i tests/functional_tests/jet_recipes/build-pyt.yaml
artifacts:
paths:
- tests/functional_tests/jet_recipes
interruptible: true

retry:
max: 2

jet-trigger:
stage: jet
stage: functional_tests
extends: [.jet_common, .jet-trigger]
needs: [ metadata, jet-configure, jet-setup ]
needs: [metadata, jet-configure, jet-setup]
trigger:
project: dl/jet/ci
branch: $JET_CI_BRANCH
Expand All @@ -71,7 +74,7 @@ jet-trigger:
- SLURM_CLUSTER
- JET_CI_BRANCH
variables:
JET_WORKLOADS_FILTER: "$_JET_FILTER"
JET_WORKLOADS_FILTER: '$_JET_FILTER'
JET_CUSTOM_CONFIG: |
launchers:
${SLURM_CLUSTER}:
Expand All @@ -80,14 +83,14 @@ jet-trigger:
interruptible: true

jet-results-summary:
stage: jet
stage: functional_tests
image: gitlab-master.nvidia.com:5005/dl/jet/api:latest
tags:
- os/linux
needs: [ jet-trigger ]
needs: [jet-trigger]
before_script:
- jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN
script:
script:
- env
- RW_API_TOKEN=${PROJECT_ACCESS_TOKEN} ENDPOINT=${PROJECT_ENDPOINT} bash tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh ${CI_PIPELINE_ID}
- python -m pip install -U --no-cache-dir prettytable
Expand All @@ -105,3 +108,4 @@ jet-results-summary:
paths:
- scripts
interruptible: true

0 comments on commit c7d53da

Please sign in to comment.