Skip to content

Commit

Permalink
Test
Browse files Browse the repository at this point in the history
  • Loading branch information
tengyifei committed Feb 9, 2025
1 parent e6f2f75 commit 9711447
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cpu_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches:
- main
pull_request:
schedule:
schedule: # Schedule the job run at 12AM PST daily.
- cron: "0 8 * * *"

jobs:
Expand Down
152 changes: 152 additions & 0 deletions .github/workflows/e2e_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
name: E2E tests

on:
  push:
    branches:
      - main
  pull_request:
  schedule: # Schedule the job run at 12AM PST daily.
    - cron: "0 8 * * *"

# Note:
# v6e cluster used: http://shortn/_oswg6PyPo6
# If we migrate to another project, these configs should be updated accordingly.
env:
  XPK_CLUSTER_NAME: tpu-v6e-ci
  GCP_PROJECT: tpu-pytorch
  GCP_ZONE: us-central2
  TPU_TYPE: v6e-4

jobs:
  # Builds/pushes the docker image and submits the training workload to GKE
  # via `tp run`; exposes the generated jobset name to downstream check jobs.
  tp-run:
    name: Train models via 'tp run'
    runs-on: [ubuntu-22.04]
    env:
      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
      RUN_ID: ${{ github.run_id }}-${{ github.run_attempt }}
    outputs:
      llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
    steps:
      - name: Maximize build space
        # NOTE(review): the action ref was garbled to "[email protected]" by the
        # page scrape's email obfuscation; restored to the upstream action
        # name — confirm the pinned version against the repo history.
        uses: AdityaGarg8/[email protected]
        with:
          remove-dotnet: 'true'
          remove-android: 'true'
          remove-haskell: 'true'
          remove-codeql: 'true'
      - name: Use Docker in rootless mode
        # NOTE(review): ref likewise garbled in the scrape — confirm version.
        uses: ScribeMD/[email protected]
      - name: Add user to docker group
        run: |
          sudo usermod -aG docker $USER
          newgrp docker
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'
      - name: Install dev dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev]'
      - uses: 'google-github-actions/auth@v2'
        with:
          # Googlers: if this fails, follow http://shortn/_61iSj31q1b to debug.
          credentials_json: '${{ secrets.GCP_SA_KEY }}'
      - uses: google-github-actions/setup-gcloud@v2
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'
      - name: Verify gcp setup
        run: gcloud info
      - name: Authenticate Docker
        run: gcloud auth configure-docker --quiet
      - name: Activate SA credentials
        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
      - name: tp doctor
        run: tp doctor
      - name: tp use
        run: >
          tp use
          --project $GCP_PROJECT
          --zone $GCP_ZONE
          --cluster $XPK_CLUSTER_NAME
          --num-slices 1
          --artifact-dir $ARTIFACT_DIR
          --tpu-type $TPU_TYPE
      - name: Run Llama 3.0 8B
        id: run-llama-3-8b
        env:
          # TODO(https://github.com/AI-Hypercomputer/torchprime/issues/14): Remove and burn the token.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          XLA_IR_DEBUG: 1
          XLA_HLO_DEBUG: 1
        # A random suffix keeps jobset names unique across concurrent CI runs.
        run: |
          random_id=$(cat /dev/urandom | tr -dc a-z0-9 | head -c 8)
          name="llama-3-8b-$random_id"
          tp run \
            --name $name \
            torchprime/torch_xla_models/train.py \
            model=llama-3-8b \
            global_batch_size=8 \
            mesh.fsdp=4 \
            dataset_config_name=wikitext-103-raw-v1 \
            profile_step=3 \
            max_steps=15
          echo "name=$name" >> "$GITHUB_OUTPUT"

  # Follows the workload submitted by `tp-run` and streams its logs.
  llama-3-8b:
    name: Check Llama 3.0 8B
    needs: tp-run
    runs-on: [ubuntu-22.04]
    env:
      ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
      JOBSET_NAME: ${{ needs.tp-run.outputs.llama-3-8b-name }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'
      - name: Install dev dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev]'
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'
      - uses: google-github-actions/setup-gcloud@v2
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'
      - name: Verify gcp setup
        run: gcloud info
      - name: Activate SA credentials
        run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
      - name: tp use
        run: >
          tp use
          --project $GCP_PROJECT
          --zone $GCP_ZONE
          --cluster $XPK_CLUSTER_NAME
          --num-slices 1
          --artifact-dir $ARTIFACT_DIR
          --tpu-type $TPU_TYPE
      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials $XPK_CLUSTER_NAME --region=$GCP_ZONE --project=$GCP_PROJECT
          kubectl config view
          kubectl config set-context --current --namespace=default
      - name: Stream logs
        # Picks the first pod of the jobset and follows its TPU container logs.
        run: |
          pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=$JOBSET_NAME -o json | jq --raw-output '.items[0].metadata.name')
          kubectl logs -c jax-tpu -f $pod_name
      # TODO: stream logs
      # TODO: wait for completion
      # xpk workload list --cluster tpu-v6e-ci --wait-for-job-completion=runner-xpk-v6e-4-1-20250209-071441
      # TODO: check that there is a 'Step duration: ... s'
      # TODO: check that there is a 'Finished training run'
      # TODO: check that there is profile
      # TODO: delete workload
      # xpk workload delete --workload xpk-test-workload --cluster xpk-test
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,6 @@ htmlcov/

# torchprime config
.config

# Google cloud credentials generated during CI
gha-creds-*.json
1 change: 0 additions & 1 deletion torchprime/launcher/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm_cxx11

# Install system dependencies
RUN apt-get update && apt-get upgrade -y
RUN apt-get update && apt-get install -y curl gnupg

# Add the Google Cloud SDK package repository
Expand Down
33 changes: 25 additions & 8 deletions torchprime/launcher/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,9 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
runner.run(
[
"gcloud",
"auth",
"application-default",
"set-quota-project",
"config",
"set",
"billing/quota_project",
config.project,
],
)
Expand Down Expand Up @@ -200,8 +200,15 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
)
)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
@click.option(
"--name",
required=False,
help="Name of the workload (jobset). If not specified, "
"defaults to one based on the date and time.",
default=None,
)
@interactive
def run(args):
def run(args, name: str | None):
"""
Runs the provided SPMD training command as an xpk job on a GKE cluster.
"""
Expand All @@ -216,7 +223,12 @@ def run(args):
docker_url = buildpush(docker_project)

# Submit xpk workload
datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
workload_name = name
if workload_name is None:
datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
workload_name = (
f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}"
)
command = ["python", "torchprime/launcher/thunk.py"] + list(args)

# Forward a bunch of important env vars.
Expand All @@ -226,8 +238,13 @@ def run(args):
*forward_env("XLA_HLO_DEBUG"), # torch_xla debugging flag
]

# Pass artifact dir as another env var.
artifact_arg = ["--env", f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}"]
# Pass artifact dir and jobset name as env vars.
artifact_arg = [
"--env",
f"TORCHPRIME_ARTIFACT_DIR={config.artifact_dir}",
"--env",
f"TORCHPRIME_JOBSET_NAME={workload_name}",
]

ensure_command("xpk")
xpk_command = (
Expand All @@ -240,7 +257,7 @@ def run(args):
"--docker-image",
docker_url,
"--workload",
f"{os.environ['USER']}-xpk-{config.tpu_type}-{config.num_slices}-{datetime_str}",
workload_name,
"--tpu-type",
config.tpu_type,
"--num-slices",
Expand Down
8 changes: 4 additions & 4 deletions torchprime/launcher/thunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,19 @@
# Configure XLA graph dump path before doing anything else.
date_string = datetime.now().strftime("%Y%m%d-%H%M")
host_name = f"{slice_id}-{worker_id}"
jobset_name = os.getenv("JOBSET_NAME", date_string)
xla_dump_path = f"{mounted_artifact_dir}/{host_name}/xla_dumps/{jobset_name}/"
jobset_name = os.getenv("TORCHPRIME_JOBSET_NAME", date_string)
xla_dump_path = mounted_artifact_dir / "xla_dumps" / jobset_name / host_name
os.environ["XLA_FLAGS"] = " ".join(
[
os.getenv("XLA_FLAGS", ""),
f"--xla_dump_to={xla_dump_path}",
f"--xla_dump_to={xla_dump_path}/",
"--xla_dump_hlo_as_proto",
]
)
print(f"Dumping XLA compiler outputs to {xla_dump_path}", flush=True)

# Determine the profile dir
profile_dir = mounted_artifact_dir / host_name
profile_dir = mounted_artifact_dir / "profile" / jobset_name / host_name
print(f"Profile output directory: {profile_dir}", flush=True)

# Exec into the training script.
Expand Down

0 comments on commit 9711447

Please sign in to comment.