diff --git a/caffe2/core/net_simple.cc b/caffe2/core/net_simple.cc index 8f4a3feca5b541..15aa52aec5a8b9 100644 --- a/caffe2/core/net_simple.cc +++ b/caffe2/core/net_simple.cc @@ -128,162 +128,195 @@ vector SimpleNet::TEST_Benchmark( << ". Iters per second: " << 1000.0 * main_runs / millis << std::endl; } - vector time_per_op(operators_.size(), 0); - vector flops_per_op; - vector memory_bytes_read_per_op; - vector memory_bytes_written_per_op; - vector param_bytes_per_op; - CaffeMap time_per_op_type; - CaffeMap flops_per_op_type; - CaffeMap memory_bytes_read_per_op_type; - CaffeMap memory_bytes_written_per_op_type; - CaffeMap param_bytes_per_op_type; + + auto operators = GetOperators(); + auto results = IndividualMetrics(operators); if (run_individual) { for (int i = 0; i < main_runs; ++i) { - for (auto& op : operators_) { - op->ResetEvent(); - } - int idx = 0; - for (auto& op : operators_) { - const string& op_type = op->debug_def().type(); - if (i == 0) { // Gather flops on the first run. - auto* schema = OpSchemaRegistry::Schema(op_type); - if (schema && schema->HasCostInferenceFunction()) { - vector shapes = op->InputTensorShapes(); - - auto all_good_shapes = std::accumulate( - shapes.begin(), - shapes.end(), - true, - [](bool acc, const TensorShape& shape) { - return acc && !shape.unknown_shape(); - }); - OpSchema::Cost cost; - if (all_good_shapes) { - cost = schema->InferCost(op->debug_def(), shapes); - } + results.RunOpsWithProfiling(); + } + results.PrintOperatorProfilingResults(); + } + // We will reuse time_per_op to return the result of BenchmarkNet. + std::vector time_per_op(results.GetTimePerOp()); + for (size_t i = 0; i < time_per_op.size(); ++i) { + time_per_op[i] /= main_runs; + } + if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) { + time_per_op.insert(time_per_op.begin(), millis / main_runs); + } + return time_per_op; +} - flops_per_op.emplace_back(cost.flops); - memory_bytes_read_per_op.emplace_back(cost.bytes_read); - memory_bytes_written_per_op.emplace_back(cost.bytes_written); - param_bytes_per_op.emplace_back(cost.params_bytes); +void IndividualMetrics::RunOpsWithProfiling() { + int idx = 0; + Timer timer; + for (auto* op : operators_) { + const string& op_type = op->debug_def().type(); + if (main_runs_ == 0) { // Gather flops on the first run. + auto* schema = OpSchemaRegistry::Schema(op_type); + if (schema && schema->HasCostInferenceFunction()) { + vector shapes = op->InputTensorShapes(); - flops_per_op_type[op_type] += cost.flops; - memory_bytes_read_per_op_type[op_type] += cost.bytes_read; - memory_bytes_written_per_op_type[op_type] += cost.bytes_written; - param_bytes_per_op_type[op_type] += cost.params_bytes; - } else { - flops_per_op.emplace_back(0); - memory_bytes_read_per_op.emplace_back(0); - memory_bytes_written_per_op.emplace_back(0); - param_bytes_per_op.emplace_back(0); - } + auto all_good_shapes = std::accumulate( + shapes.begin(), + shapes.end(), + true, + [](bool acc, const TensorShape& shape) { + return acc && !shape.unknown_shape(); + }); + OpSchema::Cost cost; + if (all_good_shapes) { + cost = schema->InferCost(op->debug_def(), shapes); } - timer.Start(); - CAFFE_ENFORCE( - op->Run(), - "operator ", - op->debug_def().name(), - "(", - op_type, - ") has failed."); - float spent = timer.MilliSeconds(); - time_per_op[idx] += spent; - time_per_op_type[op_type] += spent; - ++idx; + + flops_per_op.emplace_back(cost.flops); + memory_bytes_read_per_op.emplace_back(cost.bytes_read); + memory_bytes_written_per_op.emplace_back(cost.bytes_written); + param_bytes_per_op.emplace_back(cost.params_bytes); + + flops_per_op_type[op_type] += cost.flops; + memory_bytes_read_per_op_type[op_type] += cost.bytes_read; + memory_bytes_written_per_op_type[op_type] += cost.bytes_written; + param_bytes_per_op_type[op_type] += cost.params_bytes; + } else { + flops_per_op.emplace_back(0); + memory_bytes_read_per_op.emplace_back(0); + memory_bytes_written_per_op.emplace_back(0); + param_bytes_per_op.emplace_back(0); } } - size_t idx = 0; - for (auto& op : operators_) { - const string& op_type = op->debug_def().type(); - const string& print_name = - (op->debug_def().name().size() - ? op->debug_def().name() - : (op->debug_def().output_size() ? op->debug_def().output(0) - : "NO_OUTPUT")); - std::stringstream flops_str; - if (idx < flops_per_op.size() && flops_per_op[idx]) { - flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, " - << to_string( - 1.0e-6 * flops_per_op[idx] / time_per_op[idx] * - main_runs) - << " GFLOPS)"; - } - std::stringstream memory_bytes_read_str; - if (idx < memory_bytes_read_per_op.size() && - memory_bytes_read_per_op[idx]) { - memory_bytes_read_str - << " (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx]) - << " MB)"; - } - std::stringstream memory_bytes_written_str; - if (idx < memory_bytes_written_per_op.size() && - memory_bytes_written_per_op[idx]) { - memory_bytes_written_str - << " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx]) - << " MB)"; - } - std::stringstream param_bytes_str; - if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) { - param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx]) - << " MB)"; - } - std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type - << ") " << time_per_op[idx] / main_runs << " ms/iter" - << flops_str.str() << memory_bytes_written_str.str() - << param_bytes_str.str() << std::endl; - ++idx; + timer.Start(); + CAFFE_ENFORCE( + op->Run(), + "operator ", + op->debug_def().name(), + "(", + op_type, + ") has failed."); + float spent = timer.MilliSeconds(); + time_per_op[idx] += spent; + time_per_op_type[op_type] += spent; + ++idx; + } + ++main_runs_; +} + +void IndividualMetrics::PrintOperatorProfilingResults() { + for (auto& op : operators_) { + op->ResetEvent(); + } + size_t idx = 0; + for (auto& op : operators_) { + const string& op_type = op->debug_def().type(); + num_ops_per_op_type_[op_type]++; + const string& print_name = + (op->debug_def().name().size() + ? op->debug_def().name() + : (op->debug_def().output_size() ? op->debug_def().output(0) + : "NO_OUTPUT")); + std::stringstream flops_str; + if (idx < flops_per_op.size() && flops_per_op[idx]) { + flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, " + << to_string( + 1.0e-6 * flops_per_op[idx] / time_per_op[idx] * + main_runs_) + << " GFLOPS)"; + } + std::stringstream memory_bytes_read_str; + if (idx < memory_bytes_read_per_op.size() && + memory_bytes_read_per_op[idx]) { + memory_bytes_read_str << " (" + << to_string(1.0e-6 * memory_bytes_read_per_op[idx]) + << " MB)"; } - const std::vector metric({"Time", - "FLOP", - "Feature Memory Read", - "Feature Memory Written", - "Parameter Memory"}); - const std::vector normalizer( - {1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6}); - const std::vector unit({"ms", "GFLOP", "MB", "MB", "MB"}); + std::stringstream memory_bytes_written_str; + if (idx < memory_bytes_written_per_op.size() && + memory_bytes_written_per_op[idx]) { + memory_bytes_written_str + << " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx]) + << " MB)"; + } + std::stringstream param_bytes_str; + if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) { + param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx]) + << " MB)"; + } + std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type + << ") " << time_per_op[idx] / main_runs_ << " ms/iter" + << flops_str.str() << memory_bytes_written_str.str() + << param_bytes_str.str() << std::endl; + ++idx; + } + const std::vector metric( + {"Time", + "FLOP", + "Feature Memory Read", + "Feature Memory Written", + "Parameter Memory"}); + const std::vector normalizer( + {1.0 / main_runs_, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6}); + const std::vector unit({"ms", "GFLOP", "MB", "MB", "MB"}); - std::vector*> metric_per_op_type_vec_vec; - metric_per_op_type_vec_vec.emplace_back(&time_per_op_type); - metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type); - metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type); - metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type); - metric_per_op_type_vec_vec.emplace_back(¶m_bytes_per_op_type); - for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { + std::vector*> metric_per_op_type_vec_vec; + metric_per_op_type_vec_vec.emplace_back(&time_per_op_type); + metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type); + metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type); + metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type); + metric_per_op_type_vec_vec.emplace_back(¶m_bytes_per_op_type); + for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { + auto* item = metric_per_op_type_vec_vec[i]; + std::vector> metric_per_op_type_vec( + (*item).begin(), (*item).end()); + std::sort( + metric_per_op_type_vec.begin(), + metric_per_op_type_vec.end(), + PairLargerThan); + float total_metric = 0.; + for (const auto& op_item : metric_per_op_type_vec) { + total_metric += op_item.second * normalizer[i]; + } + if (total_metric > 0.) { std::cout << metric[i] << " per operator type:" << std::endl; - auto* item = metric_per_op_type_vec_vec[i]; - std::vector> metric_per_op_type_vec( - (*item).begin(), (*item).end()); - std::sort( - metric_per_op_type_vec.begin(), - metric_per_op_type_vec.end(), - PairLargerThan); - float total_metric = 0.; - for (const auto& op_item : metric_per_op_type_vec) { - total_metric += op_item.second * normalizer[i]; - } - for (const auto& op_item : metric_per_op_type_vec) { - float percent = 0.; - if (total_metric > 0.) { - percent = (100.0 * op_item.second * normalizer[i] / total_metric); - } - std::cout << std::setw(15) << std::setfill(' ') - << op_item.second * normalizer[i] << " " << unit[i] << ". " - << std::setw(10) << std::setfill(' ') << percent << "%. " - << op_item.first << std::endl; + } + for (const auto& op_item : metric_per_op_type_vec) { + float percent = 0.; + const string& op = op_item.first; + float value = op_item.second; + if (total_metric > 0.) { + percent = (100.0 * value * normalizer[i] / total_metric); } + std::cout << std::setw(15) << std::setfill(' ') << value * normalizer[i] + << " " << unit[i] << ". " << std::setw(10) << std::setfill(' ') + << percent << "%. " << op << " (" << num_ops_per_op_type_[op] + << " ops)" << std::endl; + } + if (total_metric > 0.) { std::cout << std::setw(15) << std::setfill(' ') << total_metric << " " << unit[i] << " in Total" << std::endl; } + if (i == 0) { + if (setup_time > 0) { + std::cout << "BlackBoxPredictor setup time: " + << setup_time * normalizer[i] << " " << unit[i] << "\n"; + } + if (memory_alloc_time > 0) { + std::cout << "Memory allocation time: " + << memory_alloc_time * normalizer[i] << " " << unit[i] + << "\n"; + } + if (memory_dealloc_time > 0) { + std::cout << "Memory deallocation time: " + << memory_dealloc_time * normalizer[i] << " " << unit[i] + << std::endl; + } + if (output_dealloc_time > 0) { + std::cout << "Output deallocation time: " + << output_dealloc_time * normalizer[i] << " " << unit[i] + << std::endl; + } + } } - // We will reuse time_per_op to return the result of BenchmarkNet. - for (size_t i = 0; i < time_per_op.size(); ++i) { - time_per_op[i] /= main_runs; - } - if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) { - time_per_op.insert(time_per_op.begin(), millis / main_runs); - } - return time_per_op; } REGISTER_NET(simple, SimpleNet); diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index c6b25eab4c57cf..b3abcbf4ac1de9 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -13,6 +13,46 @@ namespace caffe2 { +struct IndividualMetrics { + public: + explicit IndividualMetrics(const std::vector& operators) + : main_runs_(0), operators_(operators) { + const auto num_ops = operators_.size(); + time_per_op.resize(num_ops, 0.0); + } + // run ops while collecting profiling results + void RunOpsWithProfiling(); + + // print out profiling results + void PrintOperatorProfilingResults(); + + const vector& GetTimePerOp() { + return time_per_op; + } + + float setup_time{0.0}; + float memory_alloc_time{0.0}; + float memory_dealloc_time{0.0}; + float output_dealloc_time{0.0}; + + private: + int main_runs_; + const std::vector& operators_; + + vector time_per_op; + vector flops_per_op; + vector memory_bytes_read_per_op; + vector memory_bytes_written_per_op; + vector param_bytes_per_op; + + CaffeMap num_ops_per_op_type_; + CaffeMap time_per_op_type; + CaffeMap flops_per_op_type; + CaffeMap memory_bytes_read_per_op_type; + CaffeMap memory_bytes_written_per_op_type; + CaffeMap param_bytes_per_op_type; +}; + // This is the very basic structure you need to run a network - all it // does is simply to run everything in sequence. If you want more fancy control // such as a DAG-like execution, check out other better net implementations.