Skip to content

Commit

Permalink
[Caffe2] Implement BlackBoxPredictor::BenchmarkIndividualOps (pytorch#52903)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: pytorch#52903

Implement BlackBoxPredictor::BenchmarkIndividualOps so that we can clean up the output tensors properly after each iteration and get more accurate per operator timing.

Add four more metrics to track setup_time, memory_alloc_time, memory_dealloc_time, and output_dealloc_time.

Reviewed By: ajyu

Differential Revision: D26657473

fbshipit-source-id: 1cf282192b531513b9ee40b37252087818412f81
  • Loading branch information
Hao Lu authored and facebook-github-bot committed Feb 28, 2021
1 parent 249c213 commit a296fa3
Show file tree
Hide file tree
Showing 2 changed files with 214 additions and 141 deletions.
315 changes: 174 additions & 141 deletions caffe2/core/net_simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,162 +128,195 @@ vector<float> SimpleNet::TEST_Benchmark(
<< ". Iters per second: " << 1000.0 * main_runs / millis
<< std::endl;
}
vector<float> time_per_op(operators_.size(), 0);
vector<uint64_t> flops_per_op;
vector<uint64_t> memory_bytes_read_per_op;
vector<uint64_t> memory_bytes_written_per_op;
vector<uint64_t> param_bytes_per_op;
CaffeMap<string, float> time_per_op_type;
CaffeMap<string, float> flops_per_op_type;
CaffeMap<string, float> memory_bytes_read_per_op_type;
CaffeMap<string, float> memory_bytes_written_per_op_type;
CaffeMap<string, float> param_bytes_per_op_type;

auto operators = GetOperators();
auto results = IndividualMetrics(operators);
if (run_individual) {
for (int i = 0; i < main_runs; ++i) {
for (auto& op : operators_) {
op->ResetEvent();
}
int idx = 0;
for (auto& op : operators_) {
const string& op_type = op->debug_def().type();
if (i == 0) { // Gather flops on the first run.
auto* schema = OpSchemaRegistry::Schema(op_type);
if (schema && schema->HasCostInferenceFunction()) {
vector<TensorShape> shapes = op->InputTensorShapes();

auto all_good_shapes = std::accumulate(
shapes.begin(),
shapes.end(),
true,
[](bool acc, const TensorShape& shape) {
return acc && !shape.unknown_shape();
});
OpSchema::Cost cost;
if (all_good_shapes) {
cost = schema->InferCost(op->debug_def(), shapes);
}
results.RunOpsWithProfiling();
}
results.PrintOperatorProfilingResults();
}
// We will reuse time_per_op to return the result of BenchmarkNet.
std::vector<float> time_per_op(results.GetTimePerOp());
for (size_t i = 0; i < time_per_op.size(); ++i) {
time_per_op[i] /= main_runs;
}
if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
time_per_op.insert(time_per_op.begin(), millis / main_runs);
}
return time_per_op;
}

flops_per_op.emplace_back(cost.flops);
memory_bytes_read_per_op.emplace_back(cost.bytes_read);
memory_bytes_written_per_op.emplace_back(cost.bytes_written);
param_bytes_per_op.emplace_back(cost.params_bytes);
void IndividualMetrics::RunOpsWithProfiling() {
int idx = 0;
Timer timer;
for (auto* op : operators_) {
const string& op_type = op->debug_def().type();
if (main_runs_ == 0) { // Gather flops on the first run.
auto* schema = OpSchemaRegistry::Schema(op_type);
if (schema && schema->HasCostInferenceFunction()) {
vector<TensorShape> shapes = op->InputTensorShapes();

flops_per_op_type[op_type] += cost.flops;
memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
param_bytes_per_op_type[op_type] += cost.params_bytes;
} else {
flops_per_op.emplace_back(0);
memory_bytes_read_per_op.emplace_back(0);
memory_bytes_written_per_op.emplace_back(0);
param_bytes_per_op.emplace_back(0);
}
auto all_good_shapes = std::accumulate(
shapes.begin(),
shapes.end(),
true,
[](bool acc, const TensorShape& shape) {
return acc && !shape.unknown_shape();
});
OpSchema::Cost cost;
if (all_good_shapes) {
cost = schema->InferCost(op->debug_def(), shapes);
}
timer.Start();
CAFFE_ENFORCE(
op->Run(),
"operator ",
op->debug_def().name(),
"(",
op_type,
") has failed.");
float spent = timer.MilliSeconds();
time_per_op[idx] += spent;
time_per_op_type[op_type] += spent;
++idx;

flops_per_op.emplace_back(cost.flops);
memory_bytes_read_per_op.emplace_back(cost.bytes_read);
memory_bytes_written_per_op.emplace_back(cost.bytes_written);
param_bytes_per_op.emplace_back(cost.params_bytes);

flops_per_op_type[op_type] += cost.flops;
memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
param_bytes_per_op_type[op_type] += cost.params_bytes;
} else {
flops_per_op.emplace_back(0);
memory_bytes_read_per_op.emplace_back(0);
memory_bytes_written_per_op.emplace_back(0);
param_bytes_per_op.emplace_back(0);
}
}
size_t idx = 0;
for (auto& op : operators_) {
const string& op_type = op->debug_def().type();
const string& print_name =
(op->debug_def().name().size()
? op->debug_def().name()
: (op->debug_def().output_size() ? op->debug_def().output(0)
: "NO_OUTPUT"));
std::stringstream flops_str;
if (idx < flops_per_op.size() && flops_per_op[idx]) {
flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
<< to_string(
1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
main_runs)
<< " GFLOPS)";
}
std::stringstream memory_bytes_read_str;
if (idx < memory_bytes_read_per_op.size() &&
memory_bytes_read_per_op[idx]) {
memory_bytes_read_str
<< " (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx])
<< " MB)";
}
std::stringstream memory_bytes_written_str;
if (idx < memory_bytes_written_per_op.size() &&
memory_bytes_written_per_op[idx]) {
memory_bytes_written_str
<< " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
<< " MB)";
}
std::stringstream param_bytes_str;
if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx])
<< " MB)";
}
std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
<< ") " << time_per_op[idx] / main_runs << " ms/iter"
<< flops_str.str() << memory_bytes_written_str.str()
<< param_bytes_str.str() << std::endl;
++idx;
timer.Start();
CAFFE_ENFORCE(
op->Run(),
"operator ",
op->debug_def().name(),
"(",
op_type,
") has failed.");
float spent = timer.MilliSeconds();
time_per_op[idx] += spent;
time_per_op_type[op_type] += spent;
++idx;
}
++main_runs_;
}

void IndividualMetrics::PrintOperatorProfilingResults() {
for (auto& op : operators_) {
op->ResetEvent();
}
size_t idx = 0;
for (auto& op : operators_) {
const string& op_type = op->debug_def().type();
num_ops_per_op_type_[op_type]++;
const string& print_name =
(op->debug_def().name().size()
? op->debug_def().name()
: (op->debug_def().output_size() ? op->debug_def().output(0)
: "NO_OUTPUT"));
std::stringstream flops_str;
if (idx < flops_per_op.size() && flops_per_op[idx]) {
flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
<< to_string(
1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
main_runs_)
<< " GFLOPS)";
}
std::stringstream memory_bytes_read_str;
if (idx < memory_bytes_read_per_op.size() &&
memory_bytes_read_per_op[idx]) {
memory_bytes_read_str << " ("
<< to_string(1.0e-6 * memory_bytes_read_per_op[idx])
<< " MB)";
}
const std::vector<string> metric({"Time",
"FLOP",
"Feature Memory Read",
"Feature Memory Written",
"Parameter Memory"});
const std::vector<double> normalizer(
{1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});
std::stringstream memory_bytes_written_str;
if (idx < memory_bytes_written_per_op.size() &&
memory_bytes_written_per_op[idx]) {
memory_bytes_written_str
<< " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
<< " MB)";
}
std::stringstream param_bytes_str;
if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx])
<< " MB)";
}
std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
<< ") " << time_per_op[idx] / main_runs_ << " ms/iter"
<< flops_str.str() << memory_bytes_written_str.str()
<< param_bytes_str.str() << std::endl;
++idx;
}
const std::vector<string> metric(
{"Time",
"FLOP",
"Feature Memory Read",
"Feature Memory Written",
"Parameter Memory"});
const std::vector<double> normalizer(
{1.0 / main_runs_, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});

std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
auto* item = metric_per_op_type_vec_vec[i];
std::vector<std::pair<string, float>> metric_per_op_type_vec(
(*item).begin(), (*item).end());
std::sort(
metric_per_op_type_vec.begin(),
metric_per_op_type_vec.end(),
PairLargerThan<string, float>);
float total_metric = 0.;
for (const auto& op_item : metric_per_op_type_vec) {
total_metric += op_item.second * normalizer[i];
}
if (total_metric > 0.) {
std::cout << metric[i] << " per operator type:" << std::endl;
auto* item = metric_per_op_type_vec_vec[i];
std::vector<std::pair<string, float>> metric_per_op_type_vec(
(*item).begin(), (*item).end());
std::sort(
metric_per_op_type_vec.begin(),
metric_per_op_type_vec.end(),
PairLargerThan<string, float>);
float total_metric = 0.;
for (const auto& op_item : metric_per_op_type_vec) {
total_metric += op_item.second * normalizer[i];
}
for (const auto& op_item : metric_per_op_type_vec) {
float percent = 0.;
if (total_metric > 0.) {
percent = (100.0 * op_item.second * normalizer[i] / total_metric);
}
std::cout << std::setw(15) << std::setfill(' ')
<< op_item.second * normalizer[i] << " " << unit[i] << ". "
<< std::setw(10) << std::setfill(' ') << percent << "%. "
<< op_item.first << std::endl;
}
for (const auto& op_item : metric_per_op_type_vec) {
float percent = 0.;
const string& op = op_item.first;
float value = op_item.second;
if (total_metric > 0.) {
percent = (100.0 * value * normalizer[i] / total_metric);
}
std::cout << std::setw(15) << std::setfill(' ') << value * normalizer[i]
<< " " << unit[i] << ". " << std::setw(10) << std::setfill(' ')
<< percent << "%. " << op << " (" << num_ops_per_op_type_[op]
<< " ops)" << std::endl;
}
if (total_metric > 0.) {
std::cout << std::setw(15) << std::setfill(' ') << total_metric << " "
<< unit[i] << " in Total" << std::endl;
}
if (i == 0) {
if (setup_time > 0) {
std::cout << "BlackBoxPredictor setup time: "
<< setup_time * normalizer[i] << " " << unit[i] << "\n";
}
if (memory_alloc_time > 0) {
std::cout << "Memory allocation time: "
<< memory_alloc_time * normalizer[i] << " " << unit[i]
<< "\n";
}
if (memory_dealloc_time > 0) {
std::cout << "Memory deallocation time: "
<< memory_dealloc_time * normalizer[i] << " " << unit[i]
<< std::endl;
}
if (output_dealloc_time > 0) {
std::cout << "Output deallocation time: "
<< output_dealloc_time * normalizer[i] << " " << unit[i]
<< std::endl;
}
}
}
// We will reuse time_per_op to return the result of BenchmarkNet.
for (size_t i = 0; i < time_per_op.size(); ++i) {
time_per_op[i] /= main_runs;
}
if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
time_per_op.insert(time_per_op.begin(), millis / main_runs);
}
return time_per_op;
}

REGISTER_NET(simple, SimpleNet);
Expand Down
40 changes: 40 additions & 0 deletions caffe2/core/net_simple.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,46 @@

namespace caffe2 {

// Collects per-operator profiling metrics (wall-clock time, FLOPs, memory
// traffic) for a fixed list of operators across repeated benchmark runs.
//
// Usage: construct with the net's operators, call RunOpsWithProfiling() once
// per benchmark iteration, then PrintOperatorProfilingResults() to dump the
// aggregated per-operator and per-operator-type statistics.
struct IndividualMetrics {
 public:
  // `operators` is borrowed by reference and must outlive this object.
  explicit IndividualMetrics(const std::vector<OperatorBase*>& operators)
      : main_runs_(0), operators_(operators) {
    const auto num_ops = operators_.size();
    time_per_op.resize(num_ops, 0.0);
  }
  // run ops while collecting profiling results
  void RunOpsWithProfiling();

  // print out profiling results
  void PrintOperatorProfilingResults();

  // Accumulated time (ms) per operator, summed over all profiling runs so
  // far; callers divide by the number of runs for a per-iteration average.
  // Declared const: this accessor does not modify the metrics.
  const vector<float>& GetTimePerOp() const {
    return time_per_op;
  }

  // Extra one-off timings (ms) that callers (e.g. BlackBoxPredictor) may
  // fill in; printed alongside the per-op breakdown when nonzero.
  float setup_time{0.0};
  float memory_alloc_time{0.0};
  float memory_dealloc_time{0.0};
  float output_dealloc_time{0.0};

 private:
  // Number of completed profiling runs (incremented by RunOpsWithProfiling).
  int main_runs_;
  // Non-owning view of the operators being profiled.
  const std::vector<OperatorBase*>& operators_;

  // Per-operator metrics, indexed in the same order as operators_.
  vector<float> time_per_op;
  vector<uint64_t> flops_per_op;
  vector<uint64_t> memory_bytes_read_per_op;
  vector<uint64_t> memory_bytes_written_per_op;
  vector<uint64_t> param_bytes_per_op;

  // Aggregates keyed by operator type string.
  CaffeMap<string, int> num_ops_per_op_type_;
  CaffeMap<string, float> time_per_op_type;
  CaffeMap<string, float> flops_per_op_type;
  CaffeMap<string, float> memory_bytes_read_per_op_type;
  CaffeMap<string, float> memory_bytes_written_per_op_type;
  CaffeMap<string, float> param_bytes_per_op_type;
};

// This is the very basic structure you need to run a network - all it
// does is simply to run everything in sequence. If you want more fancy control
// such as a DAG-like execution, check out other better net implementations.
Expand Down

0 comments on commit a296fa3

Please sign in to comment.