From 97114473032099ed74233e660b161791f5c1bd70 Mon Sep 17 00:00:00 2001
From: Yifei Teng
Date: Sat, 8 Feb 2025 18:47:49 -0800
Subject: [PATCH] Add E2E test workflow running models via `tp run`

---
 .github/workflows/cpu_test.yml |   2 +-
 .github/workflows/e2e_test.yml | 152 +++++++++++++++++++++++++++++++++
 .gitignore                     |   3 +
 torchprime/launcher/Dockerfile |   1 -
 torchprime/launcher/cli.py     |  33 +++++--
 torchprime/launcher/thunk.py   |   8 +-
 6 files changed, 185 insertions(+), 14 deletions(-)
 create mode 100644 .github/workflows/e2e_test.yml

diff --git a/.github/workflows/cpu_test.yml b/.github/workflows/cpu_test.yml
index 1eae235..1561f25 100644
--- a/.github/workflows/cpu_test.yml
+++ b/.github/workflows/cpu_test.yml
@@ -5,7 +5,7 @@ on:
     branches:
       - main
   pull_request:
-  schedule:
+  schedule: # Schedule the job to run at 12AM PST daily.
     - cron: "0 8 * * *"
 
 jobs:
diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
new file mode 100644
index 0000000..caf5236
--- /dev/null
+++ b/.github/workflows/e2e_test.yml
@@ -0,0 +1,152 @@
+name: E2E tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  schedule: # Schedule the job to run at 12AM PST daily.
+    - cron: "0 8 * * *"
+
+# Note:
+# v6e cluster used: http://shortn/_oswg6PyPo6
+# If we migrate to another project, these configs should be updated accordingly.
+env:
+  XPK_CLUSTER_NAME: tpu-v6e-ci
+  GCP_PROJECT: tpu-pytorch
+  GCP_ZONE: us-central2
+  TPU_TYPE: v6e-4
+
+jobs:
+  tp-run:
+    name: Train models via 'tp run'
+    runs-on: [ubuntu-22.04]
+    env:
+      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
+      RUN_ID: ${{ github.run_id }}-${{ github.run_attempt }}
+    outputs:
+      llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
+    steps:
+      - name: Maximize build space
+        uses: AdityaGarg8/remove-unwanted-software@v4.1
+        with:
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+      - name: Use Docker in rootless mode
+        uses: ScribeMD/rootless-docker@0.2.2
+      - name: Add user to docker group
+        run: |
+          sudo usermod -aG docker $USER
+          newgrp docker
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Install dev dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          # Googlers: if this fails, follow http://shortn/_61iSj31q1b to debug.
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify GCP setup
+        run: gcloud info
+      - name: Authenticate Docker
+        run: gcloud auth configure-docker --quiet
+      - name: Activate SA credentials
+        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
+      - name: tp doctor
+        run: tp doctor
+      - name: tp use
+        run: >
+          tp use
+          --project $GCP_PROJECT
+          --zone $GCP_ZONE
+          --cluster $XPK_CLUSTER_NAME
+          --num-slices 1
+          --artifact-dir $ARTIFACT_DIR
+          --tpu-type $TPU_TYPE
+      - name: Run Llama 3.0 8B
+        id: run-llama-3-8b
+        env:
+          # TODO(https://github.com/AI-Hypercomputer/torchprime/issues/14): Remove and burn the token.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          XLA_IR_DEBUG: 1
+          XLA_HLO_DEBUG: 1
+        run: |
+          random_id=$(tr -dc a-z0-9 < /dev/urandom | head -c 8)
+          name="llama-3-8b-$random_id"
+
+          tp run \
+            --name $name \
+            torchprime/torch_xla_models/train.py \
+            model=llama-3-8b \
+            global_batch_size=8 \
+            mesh.fsdp=4 \
+            dataset_config_name=wikitext-103-raw-v1 \
+            profile_step=3 \
+            max_steps=15
+
+          echo "name=$name" >> "$GITHUB_OUTPUT"
+  llama-3-8b:
+    name: Check Llama 3.0 8B
+    needs: tp-run
+    runs-on: [ubuntu-22.04]
+    env:
+      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
+      JOBSET_NAME: ${{ needs.tp-run.outputs.llama-3-8b-name }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Install dev dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify GCP setup
+        run: gcloud info
+      - name: Activate SA credentials
+        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
+      - name: tp use
+        run: >
+          tp use
+          --project $GCP_PROJECT
+          --zone $GCP_ZONE
+          --cluster $XPK_CLUSTER_NAME
+          --num-slices 1
+          --artifact-dir $ARTIFACT_DIR
+          --tpu-type $TPU_TYPE
+      - name: Get GKE credentials
+        run: |
+          gcloud container clusters get-credentials $XPK_CLUSTER_NAME --region=$GCP_ZONE --project=$GCP_PROJECT
+          kubectl config view
+          kubectl config set-context --current --namespace=default
+      - name: Stream logs
+        run: |
+          pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=$JOBSET_NAME -o json | jq --raw-output '.items[0].metadata.name')
+          kubectl logs -c jax-tpu -f $pod_name
+          # TODO: stream logs
+          # TODO: wait for completion
+          # xpk workload list --cluster tpu-v6e-ci --wait-for-job-completion=runner-xpk-v6e-4-1-20250209-071441
+          # TODO: check that there is a 'Step duration: ... s'
+          # TODO: check that there is a 'Finished training run'
+          # TODO: check that there is a profile
+          # TODO: delete workload
+          # xpk workload delete --workload xpk-test-workload --cluster xpk-test
diff --git a/.gitignore b/.gitignore
index ff78a1e..fd421dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,6 @@ htmlcov/
 
 # torchprime config
 .config
+
+# Google Cloud credentials generated during CI
+gha-creds-*.json
diff --git a/torchprime/launcher/Dockerfile b/torchprime/launcher/Dockerfile
index f4678e7..51b3a4a 100644
--- a/torchprime/launcher/Dockerfile
+++ b/torchprime/launcher/Dockerfile
@@ -4,7 +4,6 @@
 FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm_cxx11
 
 # Install system dependencies
-RUN apt-get update && apt-get upgrade -y
 RUN apt-get update && apt-get install -y curl gnupg
 
 # Add the Google Cloud SDK package repository
diff --git a/torchprime/launcher/cli.py b/torchprime/launcher/cli.py
index 3791ada..59164fb 100644
--- a/torchprime/launcher/cli.py
+++ b/torchprime/launcher/cli.py
@@ -168,9 +168,9 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
   runner.run(
     [
       "gcloud",
-      "auth",
-      "application-default",
-      "set-quota-project",
+      "config",
+      "set",
+      "billing/quota_project",
       config.project,
     ],
   )
@@ -200,8 +200,15 @@
   )
 )
 @click.argument("args", nargs=-1, type=click.UNPROCESSED)
+@click.option(
+  "--name",
+  required=False,
+  help="Name of the workload (jobset). If not specified, "
+  "defaults to a name derived from the user, TPU type, slice count, and current time.",
+  default=None,
+)
 @interactive
-def run(args):
+def run(args, name: str | None):
   """
   Runs the provided SPMD training command as an xpk job on a GKE cluster.
   """
@@ -216,7 +223,12 @@ def run(args):
   docker_url = buildpush(docker_project)
 
   # Submit xpk workload
-  datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+  workload_name = name
+  if workload_name is None:
+    datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+    workload_name = (
+      f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}"
+    )
   command = ["python", "torchprime/launcher/thunk.py"] + list(args)
 
   # Forward a bunch of important env vars.
@@ -226,8 +238,13 @@ def run(args):
     *forward_env("XLA_HLO_DEBUG"),  # torch_xla debugging flag
   ]
 
-  # Pass artifact dir as another env var.
-  artifact_arg = ["--env", f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}"]
+  # Pass artifact dir and jobset name as env vars.
+  artifact_arg = [
+    "--env",
+    f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}",
+    "--env",
+    f"TORCHPRIME_JOBSET_NAME={workload_name}",
+  ]
 
   ensure_command("xpk")
   xpk_command = (
@@ -240,7 +257,7 @@ def run(args):
       "--docker-image",
       docker_url,
       "--workload",
-      f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}",
+      workload_name,
       "--tpu-type",
       config.tpu_type,
       "--num-slices",
diff --git a/torchprime/launcher/thunk.py b/torchprime/launcher/thunk.py
index b719e61..0deb0be 100644
--- a/torchprime/launcher/thunk.py
+++ b/torchprime/launcher/thunk.py
@@ -29,19 +29,19 @@
 # Configure XLA graph dump path before doing anything else.
date_string = datetime.now().strftime("%Y%m%d-%H%M") host_name = f"{slice_id}-{worker_id}" -jobset_name = os.getenv("JOBSET_NAME", date_string) -xla_dump_path = f"{mounted_artifact_dir}/{host_name}/xla_dumps/{jobset_name}/" +jobset_name = os.getenv("TORCHPRIME_JOBSET_NAME", date_string) +xla_dump_path = mounted_artifact_dir / "xla_dumps" / jobset_name / host_name os.environ["XLA_FLAGS"] = " ".join( [ os.getenv("XLA_FLAGS", ""), - f"--xla_dump_to={xla_dump_path}", + f"--xla_dump_to={xla_dump_path}/", "--xla_dump_hlo_as_proto", ] ) print(f"Dumping XLA compiler outputs to {xla_dump_path}", flush=True) # Determine the profile dir -profile_dir = mounted_artifact_dir / host_name +profile_dir = mounted_artifact_dir / "profile" / jobset_name / host_name print(f"Profile output directory: {profile_dir}", flush=True) # Exec into the training script.
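
Usage note for the new `--name` flag added in cli.py above. The workload name
below is illustrative, and the training flags mirror the CI job in
e2e_test.yml:

    # Launch a named workload so a downstream job can find the jobset by name.
    tp run \
      --name llama-3-8b-smoke \
      torchprime/torch_xla_models/train.py \
      model=llama-3-8b global_batch_size=8 mesh.fsdp=4 max_steps=15

When `--name` is omitted, `tp run` falls back to the generated
`{USER}-xpk-{tpu_type}-{num_slices}-{datetime}` name, matching the previous
behavior.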
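
One possible shape for the 'wait for completion' and log-assertion TODOs in
the Stream logs step. This is a sketch only: it assumes the JobSet controller
reports a `Completed` condition and that the trainer prints the literal
markers named in the TODOs; neither is verified here, and the timeout is
illustrative.

    # Block until the jobset created by `tp run` finishes.
    kubectl wait --for=condition=Completed jobset/$JOBSET_NAME --timeout=60m

    # Fetch the finished pod's logs once and assert on the expected markers.
    pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=$JOBSET_NAME \
      -o jsonpath='{.items[0].metadata.name}')
    logs=$(kubectl logs -c jax-tpu $pod_name)
    echo "$logs" | grep -q 'Step duration:'
    echo "$logs" | grep -q 'Finished training run'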