Skip to content

Commit

Permalink
Use tensorboard on demand
Browse files Browse the repository at this point in the history
  • Loading branch information
neet committed Mar 24, 2024
1 parent afbd766 commit 76a69cc
Show file tree
Hide file tree
Showing 17 changed files with 266 additions and 58 deletions.
6 changes: 5 additions & 1 deletion cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
"gapic",
"googleapis",
"huggingface",
"hypertune",
"hypertuner",
"idxmin",
"kaniko",
"logdir",
"neetlab",
"protobuf"
"protobuf",
"tensorboard"
],
"ignoreWords": [],
"import": []
Expand Down
88 changes: 81 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ readme = "README.md"
package-mode = false

[tool.poetry.dependencies]
python = ">=3.8,<3.12.0"
python = ">=3.10,<3.12.0"
datasets = "^2.18.0"
transformers = "^4.38.2"
sentencepiece = "^0.2.0"
Expand All @@ -18,6 +18,7 @@ cloudml-hypertune = "^0.1.0.dev6"
accelerate = "^0.28.0"
google-cloud-aiplatform = "^1.44.0"
google-cloud-pipeline-components = "^2.11.0"
tensorboard = "^2.16.2"


[tool.poetry.group.dev.dependencies]
Expand Down
4 changes: 4 additions & 0 deletions src/ainu_lm_pipeline/components/get_worker_pool_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
def get_worker_pool_specs(
train_image_uri: str,
tokenizer_gcs_path: str,
tensorboard_id: str,
tensorboard_experiment_name: str,
) -> list:
worker_pool_specs = [
{
Expand All @@ -17,6 +19,8 @@ def get_worker_pool_specs(
"language-model",
"--hp-tune=True",
f"--tokenizer={tokenizer_gcs_path}",
f"--tensorboard-id={tensorboard_id}",
f"--tensorboard-experiment-name={tensorboard_experiment_name}",
],
},
"machine_spec": {
Expand Down
3 changes: 3 additions & 0 deletions src/ainu_lm_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def ainu_lm_pipeline(
pipeline_root: str,
source_repo_name: str,
source_commit_sha: str,
tensorboard_id: str,
hf_repo: str,
hf_token: str,
) -> None:
Expand Down Expand Up @@ -85,6 +86,8 @@ def ainu_lm_pipeline(
worker_pool_specs_task = (
get_worker_pool_specs(
train_image_uri=cfg.TRAIN_IMAGE_URI,
tensorboard_id=tensorboard_id,
tensorboard_experiment_name=pipeline_job_id,
tokenizer_gcs_path=tokenizer_training_job_details_task.outputs[
"model_artifacts"
],
Expand Down
10 changes: 3 additions & 7 deletions src/ainu_lm_pipeline/submit.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
import argparse
import os
from datetime import datetime

import config as cfg
from google.cloud import aiplatform
from google.cloud.aiplatform.pipeline_jobs import PipelineJob


def get_timestamp() -> str:
return datetime.now().strftime("%Y%m%d%H%M%S")

from . import config as cfg

parser = argparse.ArgumentParser()
parser.add_argument("--commit-sha", type=str, required=True)
Expand All @@ -19,13 +14,14 @@ def get_timestamp() -> str:
aiplatform.init(project=cfg.PROJECT_ID, location=cfg.REGION)

args = parser.parse_args()
job_id = f"pipeline-{cfg.APP_NAME}-{get_timestamp()}"
job_id = f"pipeline-ainu-lm-{args.commit_sha}"

pipeline_params = {
"pipeline_job_id": job_id,
"pipeline_root": cfg.PIPELINE_ROOT,
"source_repo_name": "github_aynumosir_ainu-lm",
"source_commit_sha": args.commit_sha,
"tensorboard_id": os.environ.get("TENSORBOARD_ID"),
"hf_repo": "aynumosir/roberta-ainu-base",
"hf_token": os.environ.get("HF_TOKEN"),
}
Expand Down
14 changes: 8 additions & 6 deletions src/ainu_lm_trainer/app/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,6 @@ def get_argument_parser() -> argparse.ArgumentParser:
default=False,
help="Whether to use hyperparameter tuning",
)
language_model_parser.add_argument(
"--model-name",
type=str,
help="Model name to train (e.g. roberta-base-ainu)",
default=os.environ.get("MODEL_NAME"),
)
language_model_parser.add_argument(
"--num-train-epochs", type=int, help="Number of training epochs", default=10
)
Expand All @@ -51,6 +45,14 @@ def get_argument_parser() -> argparse.ArgumentParser:
help="Job directory. Use gs:/ to save to Google Cloud Storage",
default=os.environ.get("AIP_MODEL_DIR"),
)
language_model_parser.add_argument(
"--tensorboard-id",
help="Tensorboard ID",
)
# Tensorboard experiment name. Both spellings are registered because
# argparse only expands unambiguous *prefixes*: a caller passing
# `--tensorboard-experiment-name=...` would otherwise be rejected with
# "unrecognized arguments", since that flag is not a prefix of
# `--tensorboard-experiment-display-name`. `dest` is pinned so the parsed
# value is always exposed as `args.tensorboard_experiment_display_name`.
language_model_parser.add_argument(
    "--tensorboard-experiment-display-name",
    "--tensorboard-experiment-name",
    dest="tensorboard_experiment_display_name",
    help="Tensorboard experiment display name",
)

"""
Subparser for the cache
Expand Down
2 changes: 0 additions & 2 deletions src/ainu_lm_trainer/app/argument_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,13 @@ def test_parsing_language_model_training() -> None:
[
"language_model",
"--hp-tune=True",
"--model-name=test-model",
"--num-train-epochs=20",
"--tokenizer-dir=gs://test/tokenizer",
"--job-dir=gs://test/job_dir",
]
)
assert args.task == "language_model"
assert args.hp_tune == "True"
assert args.model_name == "test-model"
assert args.num_train_epochs == 20

assert args.tokenizer_dir.bucket.name == "test"
Expand Down
9 changes: 8 additions & 1 deletion src/ainu_lm_trainer/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
tokenizer(job_dir=args.job_dir)

if args.task == "language_model":
language_model(job_dir=args.job_dir, tokenizer_blob=args.tokenizer_blob)
language_model(
job_dir=args.job_dir,
tokenizer_blob=args.tokenizer_dir,
num_train_epochs=args.num_train_epochs,
hypertune_enabled=args.hp_tune,
tensorboard_id=args.tensorboard_id,
tensorboard_experiment_name=args.tensorboard_experiment_display_name,
)

if args.task == "cache":
cache()
28 changes: 23 additions & 5 deletions src/ainu_lm_trainer/app/task_language_model.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
import os
from pathlib import Path
from typing import Optional

from datasets import load_dataset
from google.cloud import storage
from google.cloud import aiplatform, storage
from google.cloud.storage import Blob

from ..models import JobDir
from ..trainers import RobertaTrainer
from ..trainers import RobertaTrainer, RobertaTrainerConfig


def language_model(job_dir: JobDir, tokenizer_blob: Blob) -> None:
def language_model(
job_dir: JobDir,
tokenizer_blob: Blob,
num_train_epochs: int,
hypertune_enabled: Optional[bool] = None,
tensorboard_id: Optional[str] = None,
tensorboard_experiment_name: Optional[str] = None,
) -> None:
aiplatform.init()

client = storage.Client()
dataset = load_dataset("aynumosir/ainu-corpora", split="data")
dataset = dataset.map(lambda example: {"text": example["sentence"]})
Expand All @@ -26,9 +36,17 @@ def language_model(job_dir: JobDir, tokenizer_blob: Blob) -> None:
# Create output directory
output_dir = Path("/tmp/ainu-lm-trainer/lm")
output_dir.mkdir(parents=True, exist_ok=True)
trainer = RobertaTrainer(
dataset, tokenizer_name_or_dir=tokenizer_dir, output_dir=output_dir

config = RobertaTrainerConfig(
num_train_epochs=num_train_epochs,
tokenizer_name_or_dir=tokenizer_dir,
output_dir=output_dir,
hypertune_enabled=hypertune_enabled,
tensorboard_id=tensorboard_id,
tensorboard_experiment_name=tensorboard_experiment_name,
)

trainer = RobertaTrainer(dataset, config=config)
trainer.train()

paths = [
Expand Down
1 change: 1 addition & 0 deletions src/ainu_lm_trainer/trainers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .byte_level_bpe_tokenizer_trainer import ByteLevelBPETokenizerTrainer
from .roberta_trainer import RobertaTrainer
from .roberta_trainer_config import RobertaTrainerConfig
Loading

0 comments on commit 76a69cc

Please sign in to comment.