From d9a9ca0d1692c78bad6767301edf4bea8ee212b1 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 1 Jul 2024 09:05:17 -0700 Subject: [PATCH] ci(feat): Auto-retry unit tests --- .gitlab-ci.yml | 48 ++++++++++++++++++++++++++++++++++-------------- jet-tests.yml | 26 +++++++++++++++----------- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a8e9647017..44e0688873 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,8 +22,8 @@ workflow: stages: - build - - test - - jet + - unit_tests + - functional_tests variables: SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" @@ -112,7 +112,7 @@ unit_tests: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' @@ -124,12 +124,14 @@ unit_tests: - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH interruptible: true + retry: + max: 2 unit_tests-data: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data rules: @@ -139,12 +141,14 @@ unit_tests-data: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-dist-checkpointing: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing rules: @@ -154,12 +158,14 @@ unit_tests-dist-checkpointing: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-fusions: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions rules: @@ -169,12 +175,14 @@ unit_tests-fusions: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-inference: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference rules: @@ -184,12 +192,14 @@ unit_tests-inference: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-models: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models rules: @@ -199,12 +209,14 @@ unit_tests-models: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-pipeline-parallel: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel rules: @@ -214,12 +226,14 @@ unit_tests-pipeline-parallel: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-tensor-parallel: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel rules: @@ -229,12 +243,14 @@ unit_tests-tensor-parallel: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-transformer: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer rules: @@ -244,12 +260,14 @@ unit_tests-transformer: when: never - when: always interruptible: true + retry: + max: 2 unit_tests-top-py: image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: - 8xL40S - stage: test + stage: unit_tests script: - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py rules: @@ -259,10 +277,12 @@ unit_tests-top-py: when: never - when: always interruptible: true + retry: + max: 2 docs_build_test: image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1 - stage: test + stage: unit_tests tags: - os/linux script: @@ -280,7 +300,7 @@ formatting: image: ${LINTING_IMAGE}:${CI_PIPELINE_ID} tags: - os/linux - stage: test + stage: unit_tests before_script: - git fetch origin main script: diff --git a/jet-tests.yml b/jet-tests.yml index c53fb58a8c..b6e03d2f67 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -1,5 +1,5 @@ .jet_common: - stage: jet + stage: functional_tests rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/' - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Build only/' @@ -17,8 +17,8 @@ include: file: downstreams.yml jet-setup: - extends: [ .jet_common ] - tags: + extends: [.jet_common] + tags: - os/linux script: - set -x @@ -28,6 +28,8 @@ jet-setup: reports: dotenv: config.env interruptible: true + retry: + max: 2 jet-configure: image: @@ -51,16 +53,17 @@ jet-configure: | .spec.source.image = env(IMAGE) ) ' -i tests/functional_tests/jet_recipes/build-pyt.yaml - artifacts: paths: - tests/functional_tests/jet_recipes interruptible: true - + retry: + max: 2 + jet-trigger: - stage: jet + stage: functional_tests extends: [.jet_common, .jet-trigger] - needs: [ metadata, jet-configure, jet-setup ] + needs: [metadata, jet-configure, jet-setup] trigger: project: dl/jet/ci branch: $JET_CI_BRANCH @@ -71,7 +74,7 @@ jet-trigger: - SLURM_CLUSTER - JET_CI_BRANCH variables: - JET_WORKLOADS_FILTER: "$_JET_FILTER" + JET_WORKLOADS_FILTER: '$_JET_FILTER' JET_CUSTOM_CONFIG: | launchers: ${SLURM_CLUSTER}: @@ -80,14 +83,14 @@ jet-trigger: interruptible: true jet-results-summary: - stage: jet + stage: functional_tests image: gitlab-master.nvidia.com:5005/dl/jet/api:latest tags: - os/linux - needs: [ jet-trigger ] + needs: [jet-trigger] before_script: - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN - script: + script: - env - RW_API_TOKEN=${PROJECT_ACCESS_TOKEN} ENDPOINT=${PROJECT_ENDPOINT} bash tests/functional_tests/shell_test_utils/restart_jet_log_jobs.sh ${CI_PIPELINE_ID} - python -m pip install -U --no-cache-dir prettytable @@ -105,3 +108,4 @@ jet-results-summary: paths: - scripts interruptible: true +