Skip to content

Commit

Permalink
Merge pull request #231 from Shreyanand/umain
Browse files Browse the repository at this point in the history
Rewrite training component using kubeflow-training library
  • Loading branch information
HumairAK authored Jan 24, 2025
2 parents c0bc6ed + 7c50e94 commit 555d222
Show file tree
Hide file tree
Showing 5 changed files with 484 additions and 670 deletions.
9 changes: 6 additions & 3 deletions pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from training import (
data_processing_op,
knowledge_processed_data_to_artifact_op,
pytorchjob_manifest_op,
pytorch_job_launcher_op,
skills_processed_data_to_artifact_op,
)
from utils import (
Expand All @@ -36,6 +36,7 @@
pvc_to_model_op,
pvc_to_mt_bench_op,
)
from utils.consts import RHELAI_IMAGE

TEACHER_CONFIG_MAP = "teacher-server"
TEACHER_SECRET = "teacher-server"
Expand Down Expand Up @@ -264,12 +265,13 @@ def ilab_pipeline(
# Training 1
# Using sdg_input_pvc_task.output as the PyTorchJob name suffix, since dsl.PIPELINE_* global variables are not templated (and so do not work) in KFP v2
# https://github.com/kubeflow/pipelines/issues/10453
training_phase_1 = pytorchjob_manifest_op(
training_phase_1 = pytorch_job_launcher_op(
model_pvc_name=model_pvc_task.output,
input_pvc_name=sdg_input_pvc_task.output,
name_suffix=sdg_input_pvc_task.output,
output_pvc_name=output_pvc_task.output,
phase_num=1,
base_image=RHELAI_IMAGE,
nproc_per_node=train_nproc_per_node,
nnodes=train_nnodes,
num_epochs=train_num_epochs_phase_1,
Expand All @@ -284,12 +286,13 @@ def ilab_pipeline(
training_phase_1.set_caching_options(False)

# Training 2
training_phase_2 = pytorchjob_manifest_op(
training_phase_2 = pytorch_job_launcher_op(
model_pvc_name=model_pvc_task.output,
input_pvc_name=sdg_input_pvc_task.output,
name_suffix=sdg_input_pvc_task.output,
output_pvc_name=output_pvc_task.output,
phase_num=2,
base_image=RHELAI_IMAGE,
nproc_per_node=train_nproc_per_node,
nnodes=train_nnodes,
num_epochs=train_num_epochs_phase_2,
Expand Down
Loading

0 comments on commit 555d222

Please sign in to comment.