Commit

test_examples: use baseline fixture
Signed-off-by: U. Artie Eoff <[email protected]>
uartie committed Feb 21, 2025
1 parent c680cc9 commit f087196
Showing 37 changed files with 514 additions and 66 deletions.
440 changes: 440 additions & 0 deletions tests/baselines/fixture/tests/test_examples.json

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 2,
"eval_batch_size": 4,
@@ -59,4 +59,4 @@
}
}
}
}
}
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 2,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 3,
"eval_batch_size": 4,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"ydshieh/coco_dataset_script": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 2,
"eval_batch_size": 4,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 2,
"eval_batch_size": 4,
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"databricks/databricks-dolly-15k": {
"num_train_epochs": 1,
"eval_batch_size": 2,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cifar10": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cnn_dailymail": {
"num_train_epochs": 1,
"eval_batch_size": 4,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cifar10": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"common_language": {
"num_train_epochs": 10,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"regisss/librispeech_asr_for_optimum_habana_ci": {
"num_train_epochs": 2,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"mozilla-foundation/common_voice_11_0": {
"num_train_epochs": 10,
"eval_batch_size": 2,
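The config edits above all make the same change: the top-level "gaudi" key is renamed to "gaudi1" so that each example config JSON can be indexed directly by the device context string. Below is a minimal sketch of how the updated test code selects a per-device, per-task section, assuming the tests/configs/examples layout introduced by this commit; the load_test_config helper and the environment-variable fallback for OH_DEVICE_CONTEXT are illustrative stand-ins, not part of the commit itself.

```python
import json
import os
from pathlib import Path

# Assumed layout, mirroring the diff: per-example config JSONs key their
# settings by device context ("gaudi1", "gaudi2", ...) instead of "gaudi".
CONFIG_DIRECTORY = Path("tests") / "configs" / "examples"

# In the real suite OH_DEVICE_CONTEXT comes from the test environment;
# reading an environment variable here is only an illustrative stand-in.
OH_DEVICE_CONTEXT = os.environ.get("OH_DEVICE_CONTEXT", "gaudi1")


def load_test_config(model_name: str, task_name: str) -> dict:
    """Pick the per-device, per-task section of an example config JSON."""
    config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
    path_to_config = CONFIG_DIRECTORY / f"{config_name}.json"
    with path_to_config.open("r") as json_file:
        # Select the section matching the current device, e.g. "gaudi1"
        test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
    # Then select the task section, e.g. "squad" or "wikitext"
    return test_config[task_name]


# Hypothetical usage: hyperparameters for a bert_base_uncased.json on first-gen Gaudi
# cfg = load_test_config("bert-base-uncased", "squad")
# cfg["num_train_epochs"], cfg["eval_batch_size"],
# cfg["distribution"]["single_card"]["learning_rate"]
```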
104 changes: 56 additions & 48 deletions tests/test_examples.py
@@ -14,7 +14,10 @@
# limitations under the License.

import json
+import logging
+import operator
import os
+import pytest
import re
import subprocess
from distutils.util import strtobool
@@ -54,15 +57,15 @@
)


-BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines")
+CONFIG_DIRECTORY = Path(__file__).parent.resolve() / Path("configs") / Path("examples")
# Models should reach at least 99% of their baseline accuracy
ACCURACY_PERF_FACTOR = 0.99
# Trainings/Evaluations should last at most 5% longer than the baseline
TIME_PERF_FACTOR = 1.05


IS_GAUDI2 = bool("gaudi2" == OH_DEVICE_CONTEXT)

+IS_GAUDI1 = bool("gaudi1" == OH_DEVICE_CONTEXT)

def _get_supported_models_for_script(
models_to_test: Dict[str, List[Tuple[str]]],
@@ -454,29 +457,28 @@ def test(self):

self._install_requirements(example_script.parent / "requirements.txt")

-# collect baseline from <model_name>_eager.json if eager_mode is True
+# collect test_config from <model_name>_eager.json if eager_mode is True
if self.EAGER_MODE:
-baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
+config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
else:
-baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
+config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")

-path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json")
+path_to_config = CONFIG_DIRECTORY / Path(config_name).with_suffix(".json")

-with path_to_baseline.open("r") as json_file:
-device = "gaudi2" if IS_GAUDI2 else "gaudi"
-baseline = json.load(json_file)[device]
+with path_to_config.open("r") as json_file:
+test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
if isinstance(self.TASK_NAME, list):
for key in self.TASK_NAME:
-if key in baseline:
-baseline = baseline[key]
+if key in test_config:
+test_config = test_config[key]
break
-if "num_train_epochs" not in baseline:
+if "num_train_epochs" not in test_config:
raise ValueError(
-f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}."
+f"Couldn't find a test config associated to any of these tasks: {self.TASK_NAME}."
)
self.TASK_NAME = key
else:
-baseline = baseline[self.TASK_NAME]
+test_config = test_config[self.TASK_NAME]

distribution = "single_card"
if multi_card:
@@ -507,7 +509,7 @@ def test(self):
if fp8 and "llama" in model_name:
env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")

-extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
+extra_command_line_arguments = test_config.get("distribution").get(distribution).get("extra_arguments", [])

if self.EAGER_MODE:
env_variables["PT_HPU_LAZY_MODE"] = "0"
@@ -569,10 +571,10 @@ def test(self):
gaudi_config_name,
tmp_dir,
task=self.TASK_NAME,
-lr=baseline.get("distribution").get(distribution).get("learning_rate"),
-train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"),
-eval_batch_size=baseline.get("eval_batch_size"),
-num_epochs=baseline.get("num_train_epochs"),
+lr=test_config.get("distribution").get(distribution).get("learning_rate"),
+train_batch_size=test_config.get("distribution").get(distribution).get("train_batch_size"),
+eval_batch_size=test_config.get("eval_batch_size"),
+num_epochs=test_config.get("num_train_epochs"),
extra_command_line_arguments=extra_command_line_arguments,
)
print(f"\n\nCommand to test: {' '.join(cmd_line[:])}\n")
@@ -585,7 +587,7 @@ def test(self):
with open(Path(tmp_dir) / "all_results.json") as fp:
results = json.load(fp)
# Ensure performance requirements (accuracy, training time) are met
-self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name)
+self.assert_no_regression(results, test_config.get("distribution").get(distribution).get("metrics"), model_name)

# TODO: is a cleanup of the dataset cache needed?
# self._cleanup_dataset_cache()
@@ -612,17 +614,24 @@ class ExampleTesterBase(TestCase):
DATASET_PARAMETER_NAME = "dataset_name"
DATASET_NAME = None
REGRESSION_METRICS = {
-"eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-"eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR),
-"eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-"train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
-"eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
+"eval_f1": (operator.ge, ACCURACY_PERF_FACTOR),
+"eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
+"perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+"eval_rougeLsum": (operator.ge, ACCURACY_PERF_FACTOR),
+"train_runtime": (operator.le, TIME_PERF_FACTOR),
+"eval_wer": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+"train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
+"eval_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
}
EAGER_MODE = False

+@pytest.fixture(autouse=True)
+def _use_(self, baseline):
+"""
+https://docs.pytest.org/en/stable/how-to/unittest.html#using-autouse-fixtures-and-accessing-other-fixtures
+"""
+self.baseline = baseline

def _create_command_line(
self,
multi_card: bool,
@@ -717,20 +726,18 @@ def _install_requirements(self, requirements_filename: Union[str, os.PathLike]):
return_code = p.wait()
self.assertEqual(return_code, 0)

-def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
+def assert_no_regression(self, results: Dict, metrics: list, model_name: str):
"""
Assert whether all possible performance requirements are met.
Attributes:
results (Dict): results of the run to assess
baseline (Dict): baseline to assert whether or not there is regression
"""
# Gather all the metrics to assess
-metrics_to_assess = []
-for metric_name in self.REGRESSION_METRICS.keys():
-if metric_name in baseline and metric_name in results:
-metrics_to_assess.append(metric_name)
-# There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
+metrics_to_assess = list(set(self.REGRESSION_METRICS.keys()) & set(metrics) & set(results.keys()))
min_number_metrics = 3

+# There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
if (
self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo", "ppo", "reward_modeling"]
or "bloom" in model_name
@@ -745,25 +752,26 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
(
f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training"
f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:"
-f" {baseline.keys()}"
+f" {metrics}"
),
)

-# Message to display if one test fails
-# This enables to show all the results and baselines even if one test fails before others
-failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n"
-for metric_name in metrics_to_assess:
-failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n"

# Assess metrics
+passed = True
for metric_name in metrics_to_assess:
-assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name]
-assert_function(
-self,
-results[metric_name],
-threshold_factor * baseline[metric_name],
-msg=f"for metric {metric_name}. {failure_message}",
-)
+fn, threshold = self.REGRESSION_METRICS[metric_name]
+def check(actual, ref):
+check.msg = f"{metric_name}: {fn.__name__}({actual}, {threshold} * {ref})\n"
+return fn(actual, threshold * ref)
+check.msg = ""

+try:
+self.baseline.assertRef(compare=check, context=[OH_DEVICE_CONTEXT], **{metric_name:results[metric_name]})
+except Exception as e:
+logging.getLogger().error(check.msg)
+passed = False

+assert passed, f"One or more metrics failed"


class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"):
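The test_examples.py changes above swap the unittest assertion helpers for plain operator functions and route reference values through the new baseline fixture (self.baseline.assertRef). Below is a minimal sketch of the threshold logic those (operator, factor) pairs encode; check_metric is an illustrative stand-in for the check() closure in the diff, not the fixture API itself, and the numbers in the usage example are made up.

```python
import operator

ACCURACY_PERF_FACTOR = 0.99  # reach at least 99% of the baseline accuracy
TIME_PERF_FACTOR = 1.05      # take at most 105% of the baseline runtime

# Same shape as the new REGRESSION_METRICS: metric -> (comparison, threshold factor)
REGRESSION_METRICS = {
    "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
    "train_runtime": (operator.le, TIME_PERF_FACTOR),
    "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
}


def check_metric(metric_name: str, measured: float, reference: float) -> bool:
    """Illustrative stand-in for the per-metric check() closure passed to assertRef."""
    fn, threshold = REGRESSION_METRICS[metric_name]
    ok = fn(measured, threshold * reference)
    if not ok:
        # Mirrors the failure message format built in check(),
        # e.g. "train_runtime: le(410.2, 1.05 * 380.0)"
        print(f"{metric_name}: {fn.__name__}({measured}, {threshold} * {reference})")
    return ok


# Made-up numbers: accuracy may drop at most 1%, runtime may grow at most 5%
assert check_metric("eval_accuracy", 0.99, 1.0)     # 0.99 >= 0.99 * 1.0  -> passes
assert check_metric("train_runtime", 400.0, 390.0)  # 400.0 <= 1.05 * 390.0 -> passes
```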
