Skip to content

Commit

Permalink
Quantization oneDNN backend only support VNNI CPU (pytorch#103653)
Browse files Browse the repository at this point in the history
**Summary**

- Update the quantization document that default qconfig with oneDNN backend is recommended to be used on CPUs with Vector Neural Network Instruction support.
- Add a warning message that is emitted when a user selects the default qconfig with the oneDNN backend on a CPU without Vector Neural Network Instruction support.

Pull Request resolved: pytorch#103653
Approved by: https://github.com/jgong5, https://github.com/malfet
  • Loading branch information
leslie-fang-intel authored and pytorchmergebot committed Jun 19, 2023
1 parent 7b3242d commit 9832cfb
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 1 deletion.
12 changes: 12 additions & 0 deletions aten/src/ATen/cpu/Utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include <ATen/cpu/Utils.h>
#include <cpuinfo.h>

namespace at {
namespace cpu {

// True when the running x86 CPU reports AVX-512 VNNI support via cpuinfo.
// Returns false if cpuinfo fails to initialize.
bool is_cpu_support_vnni() {
  if (!cpuinfo_initialize()) {
    return false;
  }
  return cpuinfo_has_x86_avx512vnni();
}

} // namespace cpu
} // namespace at
12 changes: 12 additions & 0 deletions aten/src/ATen/cpu/Utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <c10/macros/Export.h>

namespace at {
namespace cpu {

// Returns true if the CPU supports Vector Neural Network Instructions
// (VNNI). Implemented via cpuinfo's AVX-512 VNNI query; see
// aten/src/ATen/cpu/Utils.cpp.
TORCH_API bool is_cpu_support_vnni();

} // namespace cpu
} // namespace at
2 changes: 2 additions & 0 deletions build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,7 @@ libtorch_python_core_sources = [
"torch/csrc/utils/tensor_types.cpp",
"torch/csrc/utils/disable_torch_function.cpp",
"torch/csrc/utils/verbose.cpp",
"torch/csrc/cpu/Module.cpp",
] + lazy_tensor_core_python_sources

libtorch_python_distributed_core_sources = [
Expand Down Expand Up @@ -1082,6 +1083,7 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/vulkan/Context.cpp",
"aten/src/ATen/native/prim_native_functions.cpp",
"aten/src/ATen/native/verbose_wrapper.cpp",
"aten/src/ATen/cpu/Utils.cpp",
] + aten_cpu_non_globed_sources

aten_cpu_source_codegen_list = [
Expand Down
4 changes: 4 additions & 0 deletions docs/source/quantization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,10 @@ if ``dtype`` is ``torch.qint8``, make sure to set a custom ``quant_min`` to be `
you call the `torch.ao.quantization.get_default_qconfig(backend)` or `torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for
``x86`` or ``qnnpack`` backend

2. If ``onednn`` backend is selected, 8 bits for activation will be used in the default qconfig mapping ``torch.ao.quantization.get_default_qconfig_mapping('onednn')``
and default qconfig ``torch.ao.quantization.get_default_qconfig('onednn')``. It is recommended to be used on CPUs with Vector Neural Network Instruction (VNNI)
support. Otherwise, set ``reduce_range`` to True on the activation's observer to get better accuracy on CPUs without VNNI support.

Frequently Asked Questions
--------------------------

Expand Down
2 changes: 1 addition & 1 deletion torch/_C/__init__.pyi.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ from torch.types import (

# This module is defined in torch/csrc/Module.cpp

from . import _functorch, _lazy, _lazy_ts_backend, _nn, _onnx, _VariableFunctions
from . import _functorch, _lazy, _lazy_ts_backend, _nn, _onnx, _VariableFunctions, _cpu

T = TypeVar("T")
S = TypeVar("S", bound="torch.Tensor")
Expand Down
5 changes: 5 additions & 0 deletions torch/_C/_cpu.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from torch.types import _bool

# Defined in torch/csrc/cpu/Module.cpp

# Returns True if the CPU supports Vector Neural Network Instructions (VNNI).
def _is_cpu_support_vnni() -> _bool: ...
4 changes: 4 additions & 0 deletions torch/ao/quantization/qconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,10 @@ def get_default_qconfig(backend='x86', version=0):
qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False),
weight=default_weight_observer)
elif backend == 'onednn':
if not torch.cpu._is_cpu_support_vnni():
warnings.warn(
"Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues "
"on CPU without Vector Neural Network Instruction support.")
qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False),
weight=default_per_channel_weight_observer)
elif backend == 'x86':
Expand Down
5 changes: 5 additions & 0 deletions torch/cpu/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
from . import amp
import torch

def _is_cpu_support_vnni() -> bool:
r"""Returns a bool indicating if CPU supports VNNI."""
return torch._C._cpu._is_cpu_support_vnni()
2 changes: 2 additions & 0 deletions torch/csrc/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
#include <torch/csrc/autograd/python_sparse_functions.h>
#include <torch/csrc/autograd/python_special_functions.h>
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/cpu/Module.h>
#include <torch/csrc/dynamo/init.h>
#include <torch/csrc/functorch/init.h>
#include <torch/csrc/jit/python/init.h>
Expand Down Expand Up @@ -1371,6 +1372,7 @@ PyObject* initModule() {
#ifdef USE_CUDA
torch::cuda::initModule(module);
#endif
torch::cpu::initModule(module);
torch::initVerboseBindings(module);
ASSERT_TRUE(THPStorage_init(module));

Expand Down
16 changes: 16 additions & 0 deletions torch/csrc/cpu/Module.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include <ATen/cpu/Utils.h>
#include <torch/csrc/cpu/Module.h>
#include <torch/csrc/utils/pybind.h>

namespace torch {
namespace cpu {

// Registers the `_cpu` submodule (exposed as torch._C._cpu) and its bindings
// on the given top-level extension module.
void initModule(PyObject* module) {
  auto py_module = py::handle(module).cast<py::module>();

  auto cpu_module = py_module.def_submodule("_cpu", "cpu related pybind.");
  cpu_module.def(
      "_is_cpu_support_vnni", []() { return at::cpu::is_cpu_support_vnni(); });
}

} // namespace cpu
} // namespace torch
10 changes: 10 additions & 0 deletions torch/csrc/cpu/Module.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once
#include <torch/csrc/python_headers.h>

namespace torch {
namespace cpu {

// Registers CPU-related Python bindings (the `_cpu` submodule) on the given
// module. Invoked from initModule() in torch/csrc/Module.cpp.
void initModule(PyObject* module);

} // namespace cpu
} // namespace torch

0 comments on commit 9832cfb

Please sign in to comment.