Skip to content

Commit

Permalink
Updates envs and configs for MLPerf tests on Cirrus
Browse files: browse the repository at this point in the history
user name committed Sep 24, 2024
1 parent 058ba46 commit 49d7188
Showing 4 changed files with 94 additions and 48 deletions.
45 changes: 31 additions & 14 deletions tests/mlperf/cosmoflow/gpu.py
Original file line number | Diff line number | Diff line change
@@ -17,25 +17,16 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
lbs = parameter([2])


time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
@@ -52,6 +43,18 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]


elif self.current_system.name in ["cirrus"]:
self.executable = "python"
@@ -60,14 +63,28 @@ def setup_systems(self):
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
"--device",
"cuda",
"--data-dir",
"/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini "
"-lbs",
"2",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]


@run_before("run")
def set_task_distribution(self):
41 changes: 27 additions & 14 deletions tests/mlperf/deepcam/gpu.py
Original file line number | Diff line number | Diff line change
@@ -17,25 +17,15 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
# lbs = parameter([2])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
@@ -52,6 +42,18 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]


elif self.current_system.name in ["cirrus"]:
self.executable = "python"
@@ -60,14 +62,25 @@ def setup_systems(self):
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"2",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]

@run_before("run")
def set_task_distribution(self):
54 changes: 35 additions & 19 deletions tests/mlperf/resnet50/gpu.py
Original file line number | Diff line number | Diff line change
@@ -17,27 +17,15 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck):

num_tasks = None
num_gpus = parameter([4])
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
# lbs = parameter([8])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Environment setup"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
@@ -54,25 +42,53 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]


elif self.current_system.name in ["cirrus"]:
self.executable_opts[2] = (
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
)
# self.executable_opts[2] = (
# "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
# )
self.executable = "python"
self.extra_resources = {
"qos": {"qos": "gpu"},
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"2",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]


@run_before("run")
def set_task_distribution(self):
2 changes: 1 addition & 1 deletion tests/mlperf/resnet50/graphcore.py
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@


@rfm.simple_test
class ResNetGPUServiceBenchmark(ResNet50BaseCheck):
class ResNetGPUServiceGraphCoreBenchmark(ResNet50BaseCheck):
"""Resnet50 test class for graphcore"""

valid_prog_environs = ["*"]

0 comments on commit 49d7188

Please sign in to comment.