From 97114473032099ed74233e660b161791f5c1bd70 Mon Sep 17 00:00:00 2001
From: Yifei Teng
Date: Sat, 8 Feb 2025 18:47:49 -0800
Subject: [PATCH] Add E2E test workflow running models via `tp run`

---
 .github/workflows/cpu_test.yml |   2 +-
 .github/workflows/e2e_test.yml | 152 +++++++++++++++++++++++++++++++++
 .gitignore                     |   3 +
 torchprime/launcher/Dockerfile |   1 -
 torchprime/launcher/cli.py     |  33 +++++--
 torchprime/launcher/thunk.py   |   8 +-
 6 files changed, 185 insertions(+), 14 deletions(-)
 create mode 100644 .github/workflows/e2e_test.yml

diff --git a/.github/workflows/cpu_test.yml b/.github/workflows/cpu_test.yml
index 1eae235..1561f25 100644
--- a/.github/workflows/cpu_test.yml
+++ b/.github/workflows/cpu_test.yml
@@ -5,7 +5,7 @@ on:
     branches:
       - main
   pull_request:
-  schedule:
+  schedule: # Schedule the job to run at 12AM PST daily.
     - cron: "0 8 * * *"
 
 jobs:
diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
new file mode 100644
index 0000000..caf5236
--- /dev/null
+++ b/.github/workflows/e2e_test.yml
@@ -0,0 +1,152 @@
+name: E2E tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  schedule: # Schedule the job to run at 12AM PST daily.
+    - cron: "0 8 * * *"
+
+# Note:
+# v6e cluster used: http://shortn/_oswg6PyPo6
+# If we migrate to another project, these configs should be updated accordingly.
+env:
+  XPK_CLUSTER_NAME: tpu-v6e-ci
+  GCP_PROJECT: tpu-pytorch
+  GCP_ZONE: us-central2
+  TPU_TYPE: v6e-4
+
+jobs:
+  tp-run:
+    name: Train models via 'tp run'
+    runs-on: [ubuntu-22.04]
+    env:
+      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
+      RUN_ID: ${{ github.run_id }}-${{ github.run_attempt }}
+    outputs:
+      llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
+    steps:
+      - name: Maximize build space
+        uses: AdityaGarg8/remove-unwanted-software@v4.1
+        with:
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+      - name: Use Docker in rootless mode
+        uses: ScribeMD/rootless-docker@0.2.2
+      - name: Add user to docker group
+        run: |
+          sudo usermod -aG docker $USER
+          newgrp docker
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Install dev dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          # Googlers: if this fails, follow http://shortn/_61iSj31q1b to debug.
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify GCP setup
+        run: gcloud info
+      - name: Authenticate Docker
+        run: gcloud auth configure-docker --quiet
+      - name: Activate SA credentials
+        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
+      - name: tp doctor
+        run: tp doctor
+      - name: tp use
+        run: >
+          tp use
+          --project $GCP_PROJECT
+          --zone $GCP_ZONE
+          --cluster $XPK_CLUSTER_NAME
+          --num-slices 1
+          --artifact-dir $ARTIFACT_DIR
+          --tpu-type $TPU_TYPE
+      - name: Run Llama 3.0 8B
+        id: run-llama-3-8b
+        env:
+          # TODO(https://github.com/AI-Hypercomputer/torchprime/issues/14): Remove and burn the token.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          XLA_IR_DEBUG: 1
+          XLA_HLO_DEBUG: 1
+        run: |
+          random_id=$(tr -dc a-z0-9 < /dev/urandom | head -c 8)
+          name="llama-3-8b-$random_id"
+
+          tp run \
+            --name $name \
+            torchprime/torch_xla_models/train.py \
+            model=llama-3-8b \
+            global_batch_size=8 \
+            mesh.fsdp=4 \
+            dataset_config_name=wikitext-103-raw-v1 \
+            profile_step=3 \
+            max_steps=15
+
+          echo "name=$name" >> "$GITHUB_OUTPUT"
+  llama-3-8b:
+    name: Check Llama 3.0 8B
+    needs: tp-run
+    runs-on: [ubuntu-22.04]
+    env:
+      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
+      JOBSET_NAME: ${{ needs.tp-run.outputs.llama-3-8b-name }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Install dev dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify GCP setup
+        run: gcloud info
+      - name: Activate SA credentials
+        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
+      - name: tp use
+        run: >
+          tp use
+          --project $GCP_PROJECT
+          --zone $GCP_ZONE
+          --cluster $XPK_CLUSTER_NAME
+          --num-slices 1
+          --artifact-dir $ARTIFACT_DIR
+          --tpu-type $TPU_TYPE
+      - name: Get GKE credentials
+        run: |
+          gcloud container clusters get-credentials $XPK_CLUSTER_NAME --region=$GCP_ZONE --project=$GCP_PROJECT
+          kubectl config view
+          kubectl config set-context --current --namespace=default
+      - name: Stream logs
+        run: |
+          pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=$JOBSET_NAME -o json | jq --raw-output '.items[0].metadata.name')
+          kubectl logs -c jax-tpu -f $pod_name
+          # TODO: stream logs
+          # TODO: wait for completion
+          # xpk workload list --cluster tpu-v6e-ci --wait-for-job-completion=runner-xpk-v6e-4-1-20250209-071441
+          # TODO: check that there is a 'Step duration: ... s'
+          # TODO: check that there is a 'Finished training run'
+          # TODO: check that there is a profile
+          # TODO: delete workload
+          # xpk workload delete --workload xpk-test-workload --cluster xpk-test
diff --git a/.gitignore b/.gitignore
index ff78a1e..fd421dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,6 @@ htmlcov/
 
 # torchprime config
 .config
+
+# Google Cloud credentials generated during CI
+gha-creds-*.json
diff --git a/torchprime/launcher/Dockerfile b/torchprime/launcher/Dockerfile
index f4678e7..51b3a4a 100644
--- a/torchprime/launcher/Dockerfile
+++ b/torchprime/launcher/Dockerfile
@@ -4,7 +4,6 @@
 FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm_cxx11
 
 # Install system dependencies
-RUN apt-get update && apt-get upgrade -y
 RUN apt-get update && apt-get install -y curl gnupg
 
 # Add the Google Cloud SDK package repository
diff --git a/torchprime/launcher/cli.py b/torchprime/launcher/cli.py
index 3791ada..59164fb 100644
--- a/torchprime/launcher/cli.py
+++ b/torchprime/launcher/cli.py
@@ -168,9 +168,9 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
   runner.run(
     [
       "gcloud",
-      "auth",
-      "application-default",
-      "set-quota-project",
+      "config",
+      "set",
+      "billing/quota_project",
       config.project,
     ],
   )
@@ -200,8 +200,15 @@
   )
 )
 @click.argument("args", nargs=-1, type=click.UNPROCESSED)
+@click.option(
+  "--name",
+  required=False,
+  help="Name of the workload (jobset). If not specified, "
+  "defaults to a name derived from the user, TPU type, slice count, and current time.",
+  default=None,
+)
 @interactive
-def run(args):
+def run(args, name: str | None):
   """
   Runs the provided SPMD training command as an xpk job on a GKE cluster.
   """
@@ -216,7 +223,12 @@ def run(args):
   docker_url = buildpush(docker_project)
 
   # Submit xpk workload
-  datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+  workload_name = name
+  if workload_name is None:
+    datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+    workload_name = (
+      f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}"
+    )
   command = ["python", "torchprime/launcher/thunk.py"] + list(args)
 
   # Forward a bunch of important env vars.
@@ -226,8 +238,13 @@ def run(args):
     *forward_env("XLA_HLO_DEBUG"),  # torch_xla debugging flag
   ]
 
-  # Pass artifact dir as another env var.
-  artifact_arg = ["--env", f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}"]
+  # Pass artifact dir and jobset name as env vars.
+  artifact_arg = [
+    "--env",
+    f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}",
+    "--env",
+    f"TORCHPRIME_JOBSET_NAME={workload_name}",
+  ]
 
   ensure_command("xpk")
   xpk_command = (
@@ -240,7 +257,7 @@ def run(args):
       "--docker-image",
       docker_url,
       "--workload",
-      f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}",
+      workload_name,
       "--tpu-type",
       config.tpu_type,
       "--num-slices",
diff --git a/torchprime/launcher/thunk.py b/torchprime/launcher/thunk.py
index b719e61..0deb0be 100644
--- a/torchprime/launcher/thunk.py
+++ b/torchprime/launcher/thunk.py
@@ -29,19 +29,19 @@
 # Configure XLA graph dump path before doing anything else.
date_string = datetime.now().strftime("%Y%m%d-%H%M") host_name = f"{slice_id}-{worker_id}" -jobset_name = os.getenv("JOBSET_NAME", date_string) -xla_dump_path = f"{mounted_artifact_dir}/{host_name}/xla_dumps/{jobset_name}/" +jobset_name = os.getenv("TORCHPRIME_JOBSET_NAME", date_string) +xla_dump_path = mounted_artifact_dir / "xla_dumps" / jobset_name / host_name os.environ["XLA_FLAGS"] = " ".join( [ os.getenv("XLA_FLAGS", ""), - f"--xla_dump_to={xla_dump_path}", + f"--xla_dump_to={xla_dump_path}/", "--xla_dump_hlo_as_proto", ] ) print(f"Dumping XLA compiler outputs to {xla_dump_path}", flush=True) # Determine the profile dir -profile_dir = mounted_artifact_dir / host_name +profile_dir = mounted_artifact_dir / "profile" / jobset_name / host_name print(f"Profile output directory: {profile_dir}", flush=True) # Exec into the training script.
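
Usage note for the new `--name` flag added in cli.py above. The workload name
below is illustrative, and the training flags mirror the CI job in
e2e_test.yml:

    # Launch a named workload so a downstream job can find the jobset by name.
    tp run \
      --name llama-3-8b-smoke \
      torchprime/torch_xla_models/train.py \
      model=llama-3-8b global_batch_size=8 mesh.fsdp=4 max_steps=15

When `--name` is omitted, `tp run` falls back to the generated
`{USER}-xpk-{tpu_type}-{num_slices}-{datetime}` name, matching the previous
behavior.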
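
One possible shape for the 'wait for completion' and log-assertion TODOs in
the Stream logs step. This is a sketch only: it assumes the JobSet controller
reports a `Completed` condition and that the trainer prints the literal
markers named in the TODOs; neither is verified here, and the timeout is
illustrative.

    # Block until the jobset created by `tp run` finishes.
    kubectl wait --for=condition=Completed jobset/$JOBSET_NAME --timeout=60m

    # Fetch the finished pod's logs once and assert on the expected markers.
    pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=$JOBSET_NAME \
      -o jsonpath='{.items[0].metadata.name}')
    logs=$(kubectl logs -c jax-tpu $pod_name)
    echo "$logs" | grep -q 'Step duration:'
    echo "$logs" | grep -q 'Finished training run'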