diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py new file mode 100644 index 0000000000000..2a17d41774a42 --- /dev/null +++ b/onnxruntime/python/tools/transformers/profile_result_processor.py @@ -0,0 +1,358 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""This profiler result processor print out the kernel time spent on each Node of the model. +Example of importing profile result file from onnxruntime_perf_test: + python profile_result_processor.py --input profile_2021-10-25_12-02-41.json +""" + +import argparse +import json + +_NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If")) + + +def parse_arguments(argv=None): + parser = argparse.ArgumentParser() + + parser.add_argument( + "-i", + "--input", + required=False, + type=str, + help="Set the input file for reading the profile results", + ) + + parser.add_argument( + "--threshold", + required=False, + type=float, + default=0.01, + help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.", + ) + + parser.add_argument( + "--provider", + required=False, + type=str, + default="cuda", + help="Execution provider to use", + ) + + parser.add_argument( + "--kernel_time_only", + required=False, + action="store_true", + help="Only include the kernel time and no fence time", + ) + + parser.set_defaults(kernel_time_only=False) + + parser.add_argument("-v", "--verbose", required=False, action="store_true") + parser.set_defaults(verbose=False) + + return parser.parse_args(argv) + + +def load_profile_json(profile_file): + print(f"loading profile output {profile_file} ...") + + with open(profile_file) as opened_file: + sess_time = json.load(opened_file) + + assert isinstance(sess_time, list) + return sess_time + + +def parse_kernel_results(sess_time, threshold=0): + """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes. + + Args: + sess_time (List[Dict]): profile data + threshold (int, optional): Minimum ratio of duration among all. Defaults to 0. + + Returns: + List[str]: lines of string for output. + """ + kernel_name_to_op_name = {} + kernel_time = {} + kernel_freq = {} + total = 0 + session_init = False + for item in sess_time: + # Skip all MemcpyHostToDevice before session_initialization + if item["cat"] == "Session" and item["name"] == "session_initialization": + session_init = True + if not session_init: + continue + + if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]: + kernel_name = item["name"] + + op_name = item["args"]["op_name"] + if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH: + continue + + # Handle MemcpyHostToDevice and MemcpyDeviceToHost here + if not op_name: + op_name = f"({kernel_name})" + + if kernel_name in kernel_time: + kernel_time[kernel_name] += item["dur"] + kernel_freq[kernel_name] += 1 + else: + kernel_time[kernel_name] = item["dur"] + kernel_freq[kernel_name] = 1 + kernel_name_to_op_name[kernel_name] = op_name + + total += item["dur"] + + if not kernel_time: + return ["No kernel record found!"] + + # Output items with run time ratio > thresholds, and sorted by duration in the descending order. + lines = [] + lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:") + lines.append("-" * 64) + lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel") + for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True): + ratio = duration / total + if ratio < threshold: + continue + + calls = kernel_freq[kernel_name] + avg_time = duration / float(calls) + lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}") + + # Group by operator + op_time = {} + for kernel_name, op_name in kernel_name_to_op_name.items(): + duration = kernel_time[kernel_name] + if op_name in op_time: + op_time[op_name] += duration + else: + op_time[op_name] = duration + + lines.append("\nGroup kernel time by operator:") + lines.append("-" * 64) + lines.append("Total(μs)\tTime%\tOperator") + for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True): + ratio = duration / total + lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}") + + return lines + + +def parse_node_results(sess_time, kernel_time_only=False, threshold=0): + """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes. + + Args: + sess_time (List[Dict]): profile data + kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False. + threshold (int, optional): Minimum ratio of duration among all. Defaults to 0. + + Returns: + List[str]: lines of string for output. + """ + node_name_list = [] + node_time = {} + node_freq = {} + node_provider = {} + total = 0 + for item in sess_time: + if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]: + node_name = ( + item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "") + ) + + if "provider" in item["args"]: + if item["args"]["provider"] == "CPUExecutionProvider": + device = "CPU" + elif item["args"]["provider"] == "CUDAExecutionProvider": + device = "CUDA" + elif item["args"]["provider"] == "DmlExecutionProvider": + device = "DML" + + if node_name not in node_provider: + node_provider[node_name] = device + else: + assert node_provider[node_name] == device + elif kernel_time_only: + continue + + op_name = item["args"]["op_name"] + if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH: + continue + + if node_name in node_time: + node_time[node_name] += item["dur"] + node_freq[node_name] += 1 + else: + node_time[node_name] = item["dur"] + node_freq[node_name] = 1 + node_name_list.append(node_name) + + total += item["dur"] + + # Output items in the original order. + lines = [ + "\nNodes in the original order:", + "-" * 64, + "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode", + ] + before_percentage = 0.0 + for node_name in node_name_list: + duration = node_time[node_name] + calls = node_freq[node_name] + avg_time = duration / float(calls) + percentage = (duration / total) * 100.0 + provider = node_provider.get(node_name, "") + before_percentage += percentage + lines.append( + f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}" + ) + + # Output items with run time ratio > thresholds, and sorted by duration in the descending order. + lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:") + lines.append("-" * 64) + lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode") + for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True): + ratio = duration / total + if ratio < threshold: + continue + + calls = node_freq[node_name] + avg_time = duration / float(calls) + percentage = (duration / total) * 100.0 + provider = node_provider.get(node_name, "") + lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}") + + return lines + + +def group_node_results(sess_time): + """Group results by operator name. + + Args: + sess_time (List[Dict]): profile data + + Returns: + List[str]: lines of string for output. + """ + op_kernel_time = {} + op_kernel_records = {} + total_kernel_time = 0 + + provider_op_kernel_time = {} + provider_op_kernel_records = {} + provider_kernel_time = {} + + op_fence_time = {} + total_fence_time = 0 + + provider_counter = {} + for item in sess_time: + if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]: + op_name = item["args"]["op_name"] + + # TODO: shall we have a separated group for nodes with subgraph? + if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH: + continue + + if "provider" not in item["args"]: + if "fence" in item["name"]: + if op_name in op_fence_time: + op_fence_time[op_name] += item["dur"] + else: + op_fence_time[op_name] = item["dur"] + total_fence_time += item["dur"] + continue + + provider = item["args"].get("provider", "") + if provider in provider_counter: + provider_counter[provider] += 1 + else: + provider_counter[provider] = 1 + + key = f"{provider}:{op_name}" + if key in provider_op_kernel_time: + provider_op_kernel_time[key] += item["dur"] + provider_op_kernel_records[key] += 1 + else: + provider_op_kernel_time[key] = item["dur"] + provider_op_kernel_records[key] = 1 + + if provider in provider_kernel_time: + provider_kernel_time[provider] += item["dur"] + else: + provider_kernel_time[provider] = item["dur"] + + if op_name in op_kernel_time: + op_kernel_time[op_name] += item["dur"] + op_kernel_records[op_name] += 1 + else: + op_kernel_time[op_name] = item["dur"] + op_kernel_records[op_name] = 1 + + total_kernel_time += item["dur"] + + lines = ["", "Grouped by operator"] + lines.append("-" * 64) + lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator") + for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True): + fence_time = op_fence_time.get(op_name, 0) + kernel_time_ratio = kernel_time / total_kernel_time + total_time = kernel_time + fence_time + time_ratio = total_time / (total_kernel_time + total_fence_time) + kernel_calls = op_kernel_records[op_name] + avg_kernel_time = kernel_time / kernel_calls + lines.append( + f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}" + ) + + lines += ["", "Grouped by provider + operator"] + lines.append("-" * 64) + lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator") + for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True): + parts = key.split(":") + provider = parts[0] + op_name = parts[1] + short_ep = provider.replace("ExecutionProvider", "") + calls = provider_op_kernel_records[key] + avg_kernel_time = kernel_time / calls + provider_time_ratio = kernel_time / provider_kernel_time[provider] + lines.append( + f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}" + ) + + return lines + + +def process_results(profile_file, args): + profile_records = load_profile_json(profile_file) + + lines = parse_kernel_results(profile_records, args.threshold) + + lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold) + + lines += group_node_results(profile_records) + + return lines + + +if __name__ == "__main__": + arguments = parse_arguments() + print("Arguments", arguments) + + from benchmark_helper import setup_logger + + setup_logger(arguments.verbose) + + profile_file = arguments.input + + results = process_results(profile_file, arguments) + + for line in results: + print(line) diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py index 2306b579f92fe..71249a1121c60 100644 --- a/onnxruntime/python/tools/transformers/profiler.py +++ b/onnxruntime/python/tools/transformers/profiler.py @@ -1,5 +1,4 @@ import argparse -import json import os import numpy @@ -14,8 +13,6 @@ python profiler.py --input profile_2021-10-25_12-02-41.json """ -NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"] - def parse_arguments(argv=None): parser = argparse.ArgumentParser() @@ -176,283 +173,6 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n return profile_file -def load_profile_json(profile_file): - print(f"loading profile output {profile_file} ...") - - with open(profile_file) as opened_file: - sess_time = json.load(opened_file) - - assert isinstance(sess_time, list) - return sess_time - - -def parse_kernel_results(sess_time, threshold=0): - """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes. - - Args: - sess_time (List[Dict]): profile data - kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False. - threshold (int, optional): Minimum ratio of duration among all. Defaults to 0. - - Returns: - List[str]: lines of string for output. - """ - kernel_name_to_op_name = {} - kernel_time = {} - kernel_freq = {} - total = 0 - session_init = False - for item in sess_time: - # Skip all MemcpyHostToDevice before session_initialization - if item["cat"] == "Session" and item["name"] == "session_initialization": - session_init = True - if not session_init: - continue - - if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]: - kernel_name = item["name"] - - op_name = item["args"]["op_name"] - if op_name in NODES_TYPE_CONTAINING_SUBGRAPH: - continue - - # Handle MemcpyHostToDevice and MemcpyDeviceToHost here - if not op_name: - op_name = f"({kernel_name})" - - if kernel_name in kernel_time: - kernel_time[kernel_name] += item["dur"] - kernel_freq[kernel_name] += 1 - else: - kernel_time[kernel_name] = item["dur"] - kernel_freq[kernel_name] = 1 - kernel_name_to_op_name[kernel_name] = op_name - - total += item["dur"] - - if not kernel_time: - return ["No kernel record found!"] - - # Output items with run time ratio > thresholds, and sorted by duration in the descending order. - lines = [] - lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:") - lines.append("-" * 64) - lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel") - for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True): - ratio = duration / total - if ratio < threshold: - continue - - calls = kernel_freq[kernel_name] - avg_time = duration / float(calls) - lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}") - - # Group by operator - op_time = {} - for kernel_name, op_name in kernel_name_to_op_name.items(): - duration = kernel_time[kernel_name] - if op_name in op_time: - op_time[op_name] += duration - else: - op_time[op_name] = duration - - lines.append("\nGroup kernel time by operator:") - lines.append("-" * 64) - lines.append("Total(μs)\tTime%\tOperator") - for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True): - ratio = duration / total - lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}") - - return lines - - -def parse_node_results(sess_time, kernel_time_only=False, threshold=0): - """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes. - - Args: - sess_time (List[Dict]): profile data - kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False. - threshold (int, optional): Minimum ratio of duration among all. Defaults to 0. - - Returns: - List[str]: lines of string for output. - """ - node_name_list = [] - node_time = {} - node_freq = {} - node_provider = {} - total = 0 - for item in sess_time: - if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]: - node_name = ( - item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "") - ) - - if "provider" in item["args"]: - if item["args"]["provider"] == "CPUExecutionProvider": - device = "CPU" - elif item["args"]["provider"] == "CUDAExecutionProvider": - device = "CUDA" - elif item["args"]["provider"] == "DmlExecutionProvider": - device = "DML" - - if node_name not in node_provider: - node_provider[node_name] = device - else: - assert node_provider[node_name] == device - elif kernel_time_only: - continue - - op_name = item["args"]["op_name"] - if op_name in NODES_TYPE_CONTAINING_SUBGRAPH: - continue - - if node_name in node_time: - node_time[node_name] += item["dur"] - node_freq[node_name] += 1 - else: - node_time[node_name] = item["dur"] - node_freq[node_name] = 1 - node_name_list.append(node_name) - - total += item["dur"] - - # Output items in the original order. - lines = [ - "\nNodes in the original order:", - "-" * 64, - "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode", - ] - before_percentage = 0.0 - for node_name in node_name_list: - duration = node_time[node_name] - calls = node_freq[node_name] - avg_time = duration / float(calls) - percentage = (duration / total) * 100.0 - provider = node_provider.get(node_name, "") - before_percentage += percentage - lines.append( - f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}" - ) - - # Output items with run time ratio > thresholds, and sorted by duration in the descending order. - lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:") - lines.append("-" * 64) - lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode") - for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True): - ratio = duration / total - if ratio < threshold: - continue - - calls = node_freq[node_name] - avg_time = duration / float(calls) - percentage = (duration / total) * 100.0 - provider = node_provider.get(node_name, "") - lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}") - - return lines - - -def group_node_results(sess_time, kernel_time_only, use_gpu): - """Group results by operator name. - - Args: - sess_time (List[Dict]): profile data - kernel_time_only (bool): Only include items for kernel time. - use_gpu (bool): GPU is used in profiling or not. - - Returns: - List[str]: lines of string for output. - """ - op_kernel_time = {} - op_kernel_records = {} - total_kernel_time = 0 - - provider_op_kernel_time = {} - provider_op_kernel_records = {} - provider_kernel_time = {} - - op_fence_time = {} - total_fence_time = 0 - - provider_counter = {} - for item in sess_time: - if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]: - op_name = item["args"]["op_name"] - - # TODO: shall we have a separated group for nodes with subgraph? - if op_name in NODES_TYPE_CONTAINING_SUBGRAPH: - continue - - if "provider" not in item["args"]: - if "fence" in item["name"]: - if op_name in op_fence_time: - op_fence_time[op_name] += item["dur"] - else: - op_fence_time[op_name] = item["dur"] - total_fence_time += item["dur"] - continue - - provider = item["args"].get("provider", "") - if provider in provider_counter: - provider_counter[provider] += 1 - else: - provider_counter[provider] = 1 - - key = f"{provider}:{op_name}" - if key in provider_op_kernel_time: - provider_op_kernel_time[key] += item["dur"] - provider_op_kernel_records[key] += 1 - else: - provider_op_kernel_time[key] = item["dur"] - provider_op_kernel_records[key] = 1 - - if provider in provider_kernel_time: - provider_kernel_time[provider] += item["dur"] - else: - provider_kernel_time[provider] = item["dur"] - - if op_name in op_kernel_time: - op_kernel_time[op_name] += item["dur"] - op_kernel_records[op_name] += 1 - else: - op_kernel_time[op_name] = item["dur"] - op_kernel_records[op_name] = 1 - - total_kernel_time += item["dur"] - - lines = ["", "Grouped by operator"] - lines.append("-" * 64) - lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator") - for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True): - fence_time = op_fence_time.get(op_name, 0) - kernel_time_ratio = kernel_time / total_kernel_time - total_time = kernel_time + fence_time - time_ratio = total_time / (total_kernel_time + total_fence_time) - kernel_calls = op_kernel_records[op_name] - avg_kernel_time = kernel_time / kernel_calls - lines.append( - f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}" - ) - - lines += ["", "Grouped by provider + operator"] - lines.append("-" * 64) - lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator") - for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True): - parts = key.split(":") - provider = parts[0] - op_name = parts[1] - short_ep = provider.replace("ExecutionProvider", "") - calls = provider_op_kernel_records[key] - avg_kernel_time = kernel_time / calls - provider_time_ratio = kernel_time / provider_kernel_time[provider] - lines.append( - f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}" - ) - - return lines - - def get_dim_from_type_proto(dim): return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None # noqa: E721 @@ -639,18 +359,6 @@ def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_len return all_inputs -def process_results(profile_file, args): - profile_records = load_profile_json(profile_file) - - lines = parse_kernel_results(profile_records, args.threshold) - - lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold) - - lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu) - - return lines - - def run(args): num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False) @@ -718,6 +426,7 @@ def run(args): profile_file = run(arguments) else: profile_file = arguments.input + from profile_result_processor import process_results results = process_results(profile_file, arguments)