From ca65281d692df36a6bed68bceaff2f2e05cdd6d3 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 25 Jun 2024 14:52:32 +0800 Subject: [PATCH 01/17] add standard int4 op for woq Signed-off-by: Mengni Wang --- .../algorithms/weight_only/gptq.py | 18 +++++++++++++++++- .../algorithms/weight_only/rtn.py | 16 ++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 4a7b35b31..a6ed82e9b 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -324,7 +324,23 @@ def gptq_quantize( satisfy_MatMulFpQ4_condition = ( Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( + if model.opset_import[0].version > 20: + q_weight, scale, zp = woq_utility.quant_tensor( + weight.T, num_bits, group_size, scheme, "uint" + ) + k_blocks = (org_shape[0] + group_size - 1) // group_size + dequant_node, new_inits = woq_utility.make_weight_only_dequant_node( + node=node, + q_weight=q_weight.astype("uint8"), + k_blocks=k_blocks, + scale=scale.astype(dtype), + axis=1, + block_size=group_size, + zero_point=zp if scheme == "asym" else None, + ) + model.add_initializers(new_inits) + model.add_node(dequant_node) + elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( "CUDAExecutionProvider" not in providers and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) ): diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index d4ca7e55e..186dcb986 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -105,6 +105,22 @@ def rtn_quantize( satisfy_MatMulFpQ4_condition = ( version.Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) + if model.model.opset_import[0].version > 20: + q_weight, scale, zp = woq_utility.quant_tensor( + weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) + ) + dequant_node, new_inits = woq_utility.make_weight_only_dequant_node( + node=node, + num_bits=num_bits, + k_blocks=k_blocks, + q_weight=q_weight.astype("uint8"), + scale=scale.astype(dtype), + axis=1, + block_size=group_size, + zero_point=zp if scheme == "asym" else None, + ) + model.add_initializers(new_inits) + new_nodes.append(dequant_node) if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( "CUDAExecutionProvider" not in providers and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) From 821ddc48ed759f97343afce021ce5e302c17fc6b Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Wed, 31 Jul 2024 04:21:24 -0700 Subject: [PATCH 02/17] support int4 dequant Signed-off-by: Mengni Wang --- onnx_neural_compressor/algorithms/utility.py | 65 +++++++++++++++++++ .../algorithms/weight_only/rtn.py | 18 +++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 45c99c207..811dd8093 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -298,6 +298,71 @@ def _get_blob_size(group_size, has_zp): # pragma: no cover return blob_size +def make_weight_only_dequant_node(node, block_size, 
k_blocks, num_bits, q_weight, scale, zero_point, axis=1): + """Build DequantizeLinear node. + Args: + node: original matmul node + block_size (int): how many elements share one scale/zp + k_blocks (int): block number + num_bits (int): num_bits + q_weight (array): quantized weight + scale (array): scale + zero_point (array): zero point + axis (int): the axis of the dequantizing dimension of the input tensor + Returns: + weight_only_dequant_node: DequantizeLinear node for weight dequantization + new_inits: initializers of the new node + """ + new_inits = [] + input_names = [] + kwargs = { + "block_size": block_size, + "axis": axis + } + + q_weight_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(block_size)), + data_type=onnx.TensorProto.UINT4, + dims=q_weight.shape, + vals=q_weight, + raw=False, + ) + new_inits.append(q_weight_tensor) + input_names.append(q_weight_tensor.name) + + scale_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_scale", + data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype), + dims=scale.shape, + vals=scale.tobytes(), + raw=True, + ) + input_names.append(scale_tensor.name) + new_inits.append(scale_tensor) + + # build zero_point tensor + zp_shape = zero_point.shape + zp_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_zp", + data_type=onnx.TensorProto.UINT4, + dims=zp_shape, + vals=zero_point, + raw=False, + ) + input_names.append(zp_tensor.name) + new_inits.append(zp_tensor) + + dequant_node = onnx.helper.make_node( + "DequantizeLinear", + inputs=input_names, + outputs=[q_weight_tensor.name + "_dequant"], + name=node.name + "_woq_dequant", + **kwargs, + ) + node.input[1] = dequant_node.output[0] + return dequant_node, new_inits + + def make_matmul_weight_only_node( node: onnx.NodeProto, weight_shape: tuple, diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 186dcb986..5cf540b65 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -105,23 +105,27 @@ def rtn_quantize( satisfy_MatMulFpQ4_condition = ( version.Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) - if model.model.opset_import[0].version > 20: - q_weight, scale, zp = woq_utility.quant_tensor( - weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) + if model.model.opset_import[0].version <= 20: + _, _, zp, scale, q_weight =quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) - dequant_node, new_inits = woq_utility.make_weight_only_dequant_node( + dequant_node, new_inits =quant_utils.make_weight_only_dequant_node( node=node, num_bits=num_bits, k_blocks=k_blocks, - q_weight=q_weight.astype("uint8"), + q_weight=q_weight.reshape(weight.T.shape).T, scale=scale.astype(dtype), axis=1, block_size=group_size, - zero_point=zp if scheme == "asym" else None, + zero_point=zp, ) model.add_initializers(new_inits) new_nodes.append(dequant_node) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( + elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( "CUDAExecutionProvider" not in providers and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) ): # pragma: no cover From f092c808ed6fdce283cf07b6ee44c44cf37f1a3a Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 6 Aug 2024 
21:38:25 -0700 Subject: [PATCH 03/17] support int4 QDQ Signed-off-by: Mengni Wang --- .../quantization/weight_only/main.py | 7 +++--- onnx_neural_compressor/algorithms/utility.py | 25 +++++++++++-------- .../algorithms/weight_only/rtn.py | 17 ++++++++----- .../quantization/algorithm_entry.py | 4 ++- onnx_neural_compressor/quantization/config.py | 3 +++ .../quantization/matmul_nbits_quantizer.py | 14 ++++++++--- 6 files changed, 46 insertions(+), 24 deletions(-) diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py index e327aa827..1fddc5ca7 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py @@ -34,7 +34,7 @@ from torch.utils import data from onnx_neural_compressor import data_reader -from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning +from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning, QuantFormat logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN @@ -138,7 +138,7 @@ def eval_func(model): if isinstance(model, str) and model.endswith(".onnx"): model_dir = os.path.dirname(model) - replace_architectures(os.path.join(model_dir, "config.json")) + #replace_architectures(os.path.join(model_dir, "config.json")) eval_args = evaluation.LMEvalParser( model="hf", @@ -147,6 +147,7 @@ def eval_func(model): tasks=",".join(args.tasks), provider="CPUExecutionProvider", trust_remote_code=args.trust_remote_code, + limit=10 ) results = evaluation.evaluate(eval_args) @@ -348,7 +349,7 @@ def rewind(self): nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else [] nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude)) if args.algorithm.upper() == "RTN": - algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise) + algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise, quant_format=QuantFormat.QDQ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( model_path, n_bits=4, diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 811dd8093..796b72950 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -254,8 +254,8 @@ def quantize_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None): axis (int, optional): process data along a specific axis. 
Default is None (process the whole data) """ quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym) - rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=1, keepdims=True) - rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=1, keepdims=True) + rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=axis, keepdims=True) + rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=axis, keepdims=True) rmin *= ratio rmax *= ratio @@ -298,10 +298,11 @@ def _get_blob_size(group_size, has_zp): # pragma: no cover return blob_size -def make_weight_only_dequant_node(node, block_size, k_blocks, num_bits, q_weight, scale, zero_point, axis=1): +def make_weight_only_dequant_node(node, weight_shape, block_size, k_blocks, num_bits, q_weight, scale, zero_point, axis=1): """Build DequantizeLinear node. Args: node: original matmul node + weight_shape (tuple): original weight shape block_size (int): how many elements share one scale/zp k_blocks (int): block number num_bits (int): num_bits @@ -320,16 +321,19 @@ def make_weight_only_dequant_node(node, block_size, k_blocks, num_bits, q_weight "axis": axis } + q_weight_pairs = q_weight[::2, :] | q_weight[1::2, :] << 4 + q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(block_size)), data_type=onnx.TensorProto.UINT4, - dims=q_weight.shape, - vals=q_weight, - raw=False, + dims=weight_shape, + vals=q_weight_pairs.T.flatten().tobytes(), + raw=True, ) new_inits.append(q_weight_tensor) input_names.append(q_weight_tensor.name) + #scale = scale.reshape((-1, weight_shape[-1])) scale_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scale", data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype), @@ -341,13 +345,14 @@ def make_weight_only_dequant_node(node, block_size, k_blocks, num_bits, q_weight new_inits.append(scale_tensor) # build zero_point tensor - zp_shape = zero_point.shape + packed_zp = zero_point[:, ::2] | zero_point[:, 1::2] << 4 + zp_tensor = onnx.helper.make_tensor( name=node.input[1] + "_zp", data_type=onnx.TensorProto.UINT4, - dims=zp_shape, - vals=zero_point, - raw=False, + dims=scale.shape, + vals=packed_zp.flatten().tobytes(), + raw=True, ) input_names.append(zp_tensor.name) new_inits.append(zp_tensor) diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 5cf540b65..f1cd180f4 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -35,6 +35,7 @@ def rtn_quantize( weight_config: dict = {}, ratios: dict = {}, providers: List[str] = ["CPUExecutionProvider"], + quant_format: int = 0, return_modelproto: bool = True, ): """Quantize the model with round to nearst method. 
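
For reference, the DequantizeLinear-based path added in this hunk stores the weight block-wise: every `block_size` consecutive elements along the quantized axis share one scale and zero point, and at opset 21 the graph reconstructs an approximation of the original weight as `(q - zero_point) * scale`. Below is a minimal NumPy sketch of that arithmetic for the asymmetric uint4 case; it is illustrative only, the helper name and the exact rmin/rmax handling are assumptions and not the library's `quantize_data` implementation.

```python
import numpy as np

def blockwise_uint4_qdq(weight: np.ndarray, block_size: int = 32) -> np.ndarray:
    """Fake-quantize a 2-D weight the way a block-wise DequantizeLinear would see it.

    Each group of `block_size` consecutive elements along axis 0 shares one
    scale/zero-point pair; values are clamped to the asymmetric uint4 range [0, 15].
    """
    k, n = weight.shape
    pad = (-k) % block_size                      # pad rows so K is a multiple of block_size
    w = np.pad(weight, ((0, pad), (0, 0)))
    blocks = w.reshape(-1, block_size, n)        # (num_blocks, block_size, N)

    rmin = np.minimum(blocks.min(axis=1, keepdims=True), 0)
    rmax = np.maximum(blocks.max(axis=1, keepdims=True), 0)
    span = np.maximum(rmax - rmin, np.finfo(np.float32).tiny)
    scale = span / 15.0
    zero_point = np.clip(np.round(-rmin / scale), 0, 15).astype(np.uint8)

    q = np.clip(np.round(blocks / scale) + zero_point, 0, 15).astype(np.uint8)
    dequant = (q.astype(np.float32) - zero_point) * scale   # what DequantizeLinear emits
    return dequant.reshape(w.shape)[:k, :]

# The per-element reconstruction error is bounded by about half a quantization step (scale / 2).
w = np.random.randn(128, 64).astype(np.float32)
print(np.abs(blockwise_uint4_qdq(w) - w).max())
```
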
@@ -105,23 +106,24 @@ def rtn_quantize( satisfy_MatMulFpQ4_condition = ( version.Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) - if model.model.opset_import[0].version <= 20: + if quant_format == 1: _, _, zp, scale, q_weight =quant_utils.quantize_data( weight.T.reshape((-1, group_size)), "uint" + str(num_bits), - sym, + False, ratio=ratios.get(node.input[1], 1), axis=1, ) dequant_node, new_inits =quant_utils.make_weight_only_dequant_node( node=node, + weight_shape=org_w_shape, num_bits=num_bits, k_blocks=k_blocks, - q_weight=q_weight.reshape(weight.T.shape).T, - scale=scale.astype(dtype), - axis=1, + q_weight=q_weight.reshape((-1, org_w_shape[-1])), + scale=scale.reshape((org_w_shape[-1], -1)).T.astype(dtype), + axis=0, block_size=group_size, - zero_point=zp, + zero_point=zp.reshape((org_w_shape[-1], -1)).T, ) model.add_initializers(new_inits) new_nodes.append(dequant_node) @@ -197,12 +199,14 @@ def apply_rtn_on_model( ratios: dict = {}, providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, + quant_format: int = 0, ) -> onnx.ModelProto: """Apply RTN on onnx model. Args: model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): onnx model. quant_config (dict): quantization config. + quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default 0. Returns: onnx.ModelProto: quantized onnx model. @@ -210,6 +214,7 @@ def apply_rtn_on_model( quant_kwargs = { "ratios": ratios, "providers": providers, + "quant_format": quant_format, } if layer_wise_quant: diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 12689fa7e..45b9cb73d 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -40,7 +40,9 @@ def rtn_quantize_entry( else: config_mapping = quant_config.config_mapping quant_kwargs = {} - quant_kwargs = {key: getattr(quant_config, key) for key in config.RTNConfig.model_params_list} + for key in config.RTNConfig.model_params_list: + val = getattr(quant_config, key) + quant_kwargs[key] = getattr(val, "value", val) model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) return model diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index f4fe2672e..c5836ac26 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -731,6 +731,7 @@ class RTNConfig(BaseConfig): model_params_list: List[str] = [ "providers", "layer_wise_quant", + "quant_format", ] name: str = constants.RTN @@ -746,6 +747,7 @@ def __init__( providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, quant_last_matmul: bool = True, + quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, white_list: List[Union[str, Callable]] = constants.RTN_OP_LIST, ): """Init RTN weight-only quantization config. 
@@ -780,6 +782,7 @@ def __init__( self.providers = providers self.layer_wise_quant = layer_wise_quant self.quant_last_matmul = quant_last_matmul + self.quant_format = quant_format self._post_init() def _post_init(self): diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index 99bf760e9..e9c3dfb03 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -22,11 +22,11 @@ from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import config +from onnx_neural_compressor.quantization import config, QuantFormat class WeightOnlyQuantConfig: - def __init__(self, algorithm): + def __init__(self, algorithm, quant_format=QuantFormat.QOperator): """This is the Base class for Weight Only Quant Configuration. Args: @@ -34,13 +34,15 @@ def __init__(self, algorithm): weight only quantize algorithm name. """ self.algorithm = algorithm + self.quant_format = quant_format class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): - def __init__(self, ratios=None, layer_wise_quant=False): + def __init__(self, ratios=None, layer_wise_quant=False, quant_format=QuantFormat.QOperator): super().__init__( algorithm="RTN", + quant_format=quant_format, ) if ratios is None: ratios = {} @@ -59,9 +61,11 @@ def __init__( mse=False, perchannel=True, layer_wise_quant=False, + quant_format=QuantFormat.QOperator, ): super().__init__( algorithm="GPTQ", + quant_format=quant_format, ) self.calibration_data_reader = calibration_data_reader self.percdamp = percdamp @@ -79,8 +83,9 @@ def __init__( calibration_data_reader: data_reader.CalibrationDataReader, enable_auto_scale=True, enable_mse_search=True, + quant_format=QuantFormat.QOperator, ): - super().__init__(algorithm="AWQ") + super().__init__(algorithm="AWQ", quant_format=quant_format) self.calibration_data_reader = calibration_data_reader self.enable_auto_scale = enable_auto_scale self.enable_mse_search = enable_mse_search @@ -133,6 +138,7 @@ def _generate_nc_config(self): "weight_sym": self.is_symmetric, "accuracy_level": self.accuracy_level, "providers": self.providers, + "quant_format": self.algo_config.quant_format, } if self.algorithm == "RTN": quant_kwargs.update( From 5d7ee3414bc70fc2d168984156a637e96ca7d910 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Wed, 14 Aug 2024 02:32:05 -0700 Subject: [PATCH 04/17] add ut and enhance code Signed-off-by: Mengni Wang --- .../quantization/weight_only/README.md | 8 +- .../quantization/weight_only/main.py | 23 +- .../quantization/weight_only/run_benchmark.sh | 18 +- .../quantization/weight_only/run_quant.sh | 26 +- .../algorithms/layer_wise/core.py | 7 +- onnx_neural_compressor/algorithms/utility.py | 78 +++-- .../algorithms/weight_only/awq.py | 73 ++--- .../algorithms/weight_only/gptq.py | 55 ++-- .../algorithms/weight_only/rtn.py | 55 ++-- onnx_neural_compressor/constants.py | 1 + onnx_neural_compressor/onnx_model.py | 21 +- .../quantization/algorithm_entry.py | 10 +- onnx_neural_compressor/quantization/config.py | 287 +++++++++--------- .../quantization/matmul_4bits_quantizer.py | 2 + .../quantization/matmul_nbits_quantizer.py | 11 +- .../quantization/quant_utils.py | 6 + test/quantization/weight_only/test_awq.py | 53 +++- test/quantization/weight_only/test_gptq.py | 51 +++- test/quantization/weight_only/test_rtn.py | 45 
++- 19 files changed, 539 insertions(+), 291 deletions(-) diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md index 79b17c73f..0173ad33f 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md @@ -40,6 +40,11 @@ python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \ ## 1. Quantization Set `algorithm=WOQ_TUNE` to tune weight-only quantization algorithm or specify algorithm to `RTN` or `GPTQ` or `AWQ`. +`quant_format=QDQ` works only when: +- onnxruntime >= 1.19.0 +- opset version of the model >= 21 +- quantized bits is in [4, 8] +otherwise it will execute QOperator automatically. ```bash bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model @@ -47,7 +52,8 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model --batch_size=batch_size # optional \ --dataset=NeelNanda/pile-10k \ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer - --algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ + --algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \ + --quant_format=QOperator # support QOperator and QDQ ``` ## 2. Benchmark diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py index 1fddc5ca7..742afc18c 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py @@ -34,7 +34,7 @@ from torch.utils import data from onnx_neural_compressor import data_reader -from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning, QuantFormat +from onnx_neural_compressor.quantization import QuantFormat, config, matmul_nbits_quantizer, tuning logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN @@ -105,6 +105,7 @@ default=[], help="nodes that will not be quantized. 
Doesn't take effect when 'algorithm' is 'WOQ_TUNE'", ) +parser.add_argument("--quant_format", type=str, default="QOperator", choices=["QOperator", "QDQ"]) args = parser.parse_args() if args.tune and not os.path.exists(args.output_model): @@ -138,7 +139,7 @@ def eval_func(model): if isinstance(model, str) and model.endswith(".onnx"): model_dir = os.path.dirname(model) - #replace_architectures(os.path.join(model_dir, "config.json")) + replace_architectures(os.path.join(model_dir, "config.json")) eval_args = evaluation.LMEvalParser( model="hf", @@ -147,7 +148,6 @@ def eval_func(model): tasks=",".join(args.tasks), provider="CPUExecutionProvider", trust_remote_code=args.trust_remote_code, - limit=10 ) results = evaluation.evaluate(eval_args) @@ -348,8 +348,11 @@ def rewind(self): nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else [] nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude)) + quant_format = QuantFormat.QOperator if args.quant_format == "QOperator" else QuantFormat.QDQ if args.algorithm.upper() == "RTN": - algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise, quant_format=QuantFormat.QDQ) + algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig( + layer_wise_quant=args.layer_wise, quant_format=quant_format + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( model_path, n_bits=4, @@ -364,7 +367,9 @@ def rewind(self): elif args.algorithm.upper() == "AWQ": calibration_data_reader = AWQDataloader(model_path, pad_max=args.pad_max, batch_size=1) algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader, enable_mse_search=False + calibration_data_reader=calibration_data_reader, + enable_mse_search=False, + quant_format=quant_format, ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( model_path, @@ -380,7 +385,9 @@ def rewind(self): elif args.algorithm.upper() == "GPTQ": calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader, layer_wise_quant=args.layer_wise + calibration_data_reader=calibration_data_reader, + layer_wise_quant=args.layer_wise, + quant_format=quant_format, ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( model_path, @@ -396,7 +403,9 @@ def rewind(self): elif args.algorithm.upper() == "WOQ_TUNE": calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) # set tolerable_loss to 0.5% for test, default is 1% - custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) + custom_tune_config = tuning.TuningConfig( + config_set=config.get_woq_tuning_config(quant_format=quant_format), tolerable_loss=0.005 + ) best_model = tuning.autotune( model_input=model_path, tune_config=custom_tune_config, diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh index 72348427c..97754097b 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh @@ -46,15 +46,15 @@ function run_benchmark { extra_cmd="--trust_remote_code True" fi - eval "python main.py \ - --model_path ${input_model} \ - --batch_size=${batch_size-1} \ - 
--tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \ - --tasks=${tasks-lambada_openai} \ - --mode=${mode} \ - --intra_op_num_threads=${intra_op_num_threads-24} \ - --benchmark \ - ${extra_cmd}" + python main.py \ + --model_path="${input_model}" \ + --batch_size="${batch_size-1}" \ + --tokenizer="${tokenizer-meta-llama/Llama-2-7b-hf}" \ + --tasks="${tasks-lambada_openai}" \ + --mode="${mode}" \ + --intra_op_num_threads="${intra_op_num_threads-24}" \ + --benchmark \ + ${extra_cmd} } diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh index 4198da9a8..255611b22 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh @@ -29,6 +29,9 @@ function init_params { --algorithm=*) algorithm=$(echo $var |cut -f2 -d=) ;; + --quant_format=*) + quant_format=$(echo $var |cut -f2 -d=) + ;; esac done @@ -69,17 +72,18 @@ function run_tuning { extra_cmd="--nodes_to_exclude ${nodes_to_exclude}" fi - eval "python main.py \ - --model_path ${input_model} \ - --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \ - --output_model ${output_model} \ - --batch_size ${batch_size-1} \ - --dataset ${dataset-NeelNanda/pile-10k} \ - --algorithm ${algorithm-WOQ_TUNE} \ - --tasks ${tasks-lambada_openai} \ - --layer_wise \ - --tune \ - ${extra_cmd}" + python main.py \ + --model_path "${input_model}" \ + --tokenizer "${tokenizer-meta-llama/Llama-2-7b-hf}" \ + --output_model "${output_model}" \ + --batch_size "${batch_size-1}" \ + --dataset "${dataset-NeelNanda/pile-10k}" \ + --algorithm "${algorithm-WOQ_TUNE}" \ + --tasks "${tasks-lambada_openai}" \ + --quant_format "${quant_format-QOperator}" \ + --layer_wise \ + --tune \ + ${extra_cmd} } main "$@" diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 80077a9be..b2a7211b9 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -18,6 +18,7 @@ import copy import os import pathlib +import tempfile import onnx import onnxruntime as ort @@ -60,7 +61,7 @@ def layer_wise_quant( model = onnx_model.ONNXModel(model, ignore_warning=True, load_external_data=False) origin_model = copy.deepcopy(model) - + tmp_file = tempfile.TemporaryDirectory() providers = kwargs.get("providers", ["CPUExecutionProvider"]) # get and check split nodes @@ -97,7 +98,7 @@ def layer_wise_quant( # split model with given split node split_model_part_1, split_model_part_2 = split_model.split_model_with_node( - split_node.name, model.model_path, save_both_split_models + split_node.name, model.model_path, save_both_split_models, save_path=tmp_file.name ) if not save_both_split_models: @@ -201,6 +202,8 @@ def layer_wise_quant( onnx.external_data_helper.load_external_data_for_model( quantized_model_merged.model, os.path.dirname(quantized_model_merged.model_path) ) + + tmp_file.cleanup() return quantized_model_merged diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 796b72950..b38c8e21e 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -95,6 +95,17 @@ def attribute_to_kwarg(attribute): } +ONNX_TENSOR_TYPE = { + "bfloat16": getattr(onnx.TensorProto, "BFLOAT16", 16), + "float32": 
getattr(onnx.TensorProto, "FLOAT", 1), + "float16": getattr(onnx.TensorProto, "FLOAT16", 10), + "int4": getattr(onnx.TensorProto, "INT4", 22), + "uint4": getattr(onnx.TensorProto, "UNT4", 21), + "int8": getattr(onnx.TensorProto, "INT8", 3), + "uint8": getattr(onnx.TensorProto, "UINT8", 2), +} + + def _qType_to_np_type(qType): if isinstance(qType, int): return onnx.helper.tensor_dtype_to_np_dtype(qType) @@ -211,11 +222,15 @@ def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False): dtype = _qType_to_np_type(qType) if isinstance(rmax, np.ndarray): if sym: - max_range = np.maximum(abs(rmin), abs(rmax)) - rmin = -max_range - rmax = max_range - scale = (rmax - rmin) / (qmax - qmin) - scale[scale < np.finfo(rmax.dtype).tiny] = 1 + mask = abs(rmin) > abs(rmax) + scale = np.ones(rmin.shape).astype(rmin.dtype) + scale[mask] = rmin[mask] + scale[~mask] = rmax[~mask] + abs_max = round((qmax - qmin) / 2) + scale /= abs_max + else: + scale = (rmax - rmin) / (qmax - qmin) + scale[abs(scale) < np.finfo(rmax.dtype).tiny] = 1 zero_point = ( np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) if sym @@ -298,42 +313,59 @@ def _get_blob_size(group_size, has_zp): # pragma: no cover return blob_size -def make_weight_only_dequant_node(node, weight_shape, block_size, k_blocks, num_bits, q_weight, scale, zero_point, axis=1): +def make_weight_only_dequant_node( + node: onnx.NodeProto, + weight_shape: tuple, + block_size: int, + num_bits: int, + dtype: str, + q_weight: np.array, + scale: np.array, + zero_point: np.array, + axis: int = 1, +): """Build DequantizeLinear node. Args: node: original matmul node weight_shape (tuple): original weight shape block_size (int): how many elements share one scale/zp - k_blocks (int): block number num_bits (int): num_bits + dtype (str): use uint or int q_weight (array): quantized weight scale (array): scale zero_point (array): zero point axis (int): the axis of the dequantizing dimension of the input tensor + Returns: weight_only_dequant_node: DequantizeLinear node for weight dequantization new_inits: initializers of the new node """ new_inits = [] input_names = [] - kwargs = { - "block_size": block_size, - "axis": axis - } + kwargs = {"block_size": block_size, "axis": axis} - q_weight_pairs = q_weight[::2, :] | q_weight[1::2, :] << 4 + q_weight = q_weight.reshape((-1, weight_shape[-1])).T + if num_bits == 4: + q_weight = ((q_weight[:, ::2] & 0xF | q_weight[:, 1::2] << 4) & 0xFF).astype("uint8") + + qtype = ONNX_TENSOR_TYPE.get(dtype + str(num_bits), None) + + if qtype is None: + raise ValueError( + "Unsupported qtype {}, only support {}".format(dtype + str(num_bits), list(ONNX_TENSOR_TYPE.keys())) + ) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(block_size)), - data_type=onnx.TensorProto.UINT4, + data_type=qtype, dims=weight_shape, - vals=q_weight_pairs.T.flatten().tobytes(), + vals=q_weight.flatten().tobytes(), raw=True, ) new_inits.append(q_weight_tensor) input_names.append(q_weight_tensor.name) - #scale = scale.reshape((-1, weight_shape[-1])) + scale = scale.reshape((weight_shape[-1], -1)).T scale_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scale", data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype), @@ -345,13 +377,15 @@ def make_weight_only_dequant_node(node, weight_shape, block_size, k_blocks, num_ new_inits.append(scale_tensor) # build zero_point tensor - packed_zp = zero_point[:, ::2] | zero_point[:, 1::2] << 4 + zero_point = 
zero_point.reshape((weight_shape[-1], -1)).T + if num_bits == 4: + zero_point = ((zero_point[:, ::2] & 0xF | zero_point[:, 1::2] << 4) & 0xFF).astype("uint8") zp_tensor = onnx.helper.make_tensor( name=node.input[1] + "_zp", - data_type=onnx.TensorProto.UINT4, + data_type=qtype, dims=scale.shape, - vals=packed_zp.flatten().tobytes(), + vals=zero_point.flatten().tobytes(), raw=True, ) input_names.append(zp_tensor.name) @@ -578,10 +612,12 @@ def dump_woq_stats(model, quantize_config): if optype not in res: res[optype] = {} - if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): - search_out = re.search("_Q\d*", node.input[1]) + if re.match("^.*_Q\d*G\d*", node.input[1]): + Q_position = re.search("_Q\d*", node.input[1]) + full_position = re.search("_Q\d*G\d*", node.input[1]) dtype = "A32W{}G{}".format( - node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] + node.input[1][Q_position.start() + 2 : Q_position.end()], + node.input[1][Q_position.end() + 1 : full_position.end()], ) else: dtype = "FP32" diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 81d896288..33eea54c8 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -63,6 +63,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): weight = [] org_out = [] + weight_dtype = weight_config[nodes[0].name].get("weight_dtype", "int") num_bits = weight_config[nodes[0].name].get("weight_bits", 4) group_size = weight_config[nodes[0].name].get("weight_group_size", 32) sym = weight_config[nodes[0].name].get("weight_sym", True) @@ -70,6 +71,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): # use same params for all children of one parent for node in nodes: + weight_config.setdefault(node.name, {}).update({"weight_dtype": weight_dtype}) weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) @@ -98,24 +100,11 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): weight = weight.T * scales weight = quant_utils.pad_tensor(weight.T, group_size, (org_w_shape[0] + group_size - 1) // group_size) - if (version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4) or ( - version.Version(ort.__version__) >= constants.ONNXRT116_VERSION - and num_bits == 4 - and group_size == 32 - ): - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = quant_utils.qdq_data( - weight.reshape((-1, group_size)), - "uint" + str(num_bits), - sym, - ).reshape(weight.shape) - else: - q_weight = quant_utils.qdq_data( - weight.reshape((-1, group_size)), - "int" + str(num_bits), - sym, - ).reshape(weight.shape) + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + weight_dtype + str(num_bits), + sym, + ).reshape(weight.shape) q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) out = np.matmul(inp, q_weight) @@ -237,6 +226,7 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) for node in nodes: + weight_dtype = weight_config[node.name].get("weight_dtype", "int") num_bits = weight_config[node.name].get("weight_bits", 4) 
group_size = weight_config[node.name].get("weight_group_size", 32) sym = weight_config[node.name].get("weight_sym", True) @@ -256,26 +246,12 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): for i_s in range(10): ratio = 1 - i_s / 100 weight = copy.deepcopy(org_weight) - if (version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4) or ( - version.Version(ort.__version__) >= constants.ONNXRT116_VERSION - and num_bits == 4 - and group_size == 32 - ): - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = quant_utils.qdq_data( - weight.reshape((-1, group_size)), - "uint" + str(num_bits), - sym, - ratio=ratio, - ).reshape(org_weight.shape) - else: - weight = quant_utils.qdq_data( - weight.reshape((-1, group_size)), - "int" + str(num_bits), - sym, - ratio=ratio, - ).reshape(org_weight.shape) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + weight_dtype + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) @@ -294,6 +270,7 @@ def awq_quantize( enable_auto_scale: bool = True, enable_mse_search: bool = True, providers: List[str] = ["CPUExecutionProvider"], + quant_format: int = 0, ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -317,6 +294,7 @@ def awq_quantize( enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. + quant_format (int, optional): use Qoperator or QDQ format. 0 means Qoperator, 1 means QDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -336,12 +314,12 @@ def awq_quantize( output_names = [] for node in model.nodes(): # check op_type of node is MatMul + # check op_name in quantization config # check dim 1 of input is weight tensor - # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] + and node.name in weight_config and model.get_initializer(node.input[1]) is not None - and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -371,12 +349,12 @@ def awq_quantize( for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul + # check op_name in quantization config # check dim 1 of input is weight tensor - # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] + and node.name in weight_config and model.get_initializer(node.input[1]) is not None - and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): dump_pairs[parent].append(model.get_node(node.name)) @@ -408,7 +386,13 @@ def awq_quantize( model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model = rtn.rtn_quantize(model, weight_config, full_ratio, providers) + model = rtn.rtn_quantize( + model=model, + weight_config=weight_config, + ratios=full_ratio, + providers=providers, + quant_format=quant_format, + ) return model @@ -419,6 +403,7 @@ def apply_awq_on_model( enable_auto_scale: bool = True, enable_mse_search: bool = True, providers: List[str] = ["CPUExecutionProvider"], + quant_format: int = 0, ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. 
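
At the API surface, this integer `quant_format` argument is not set directly: the weight-only configs introduced in this series expose a `QuantFormat` enum, and the algorithm entry points unwrap it with `getattr(val, "value", val)` before calling into `apply_rtn_on_model` / `apply_awq_on_model` / `apply_gptq_on_model`. A hedged usage sketch, assuming the enum carries the conventional values (QOperator = 0, QDQ = 1), which matches the docstrings in these patches:

```python
from onnx_neural_compressor.quantization import QuantFormat, matmul_nbits_quantizer

# Request QDQ output (DequantizeLinear + MatMul) instead of the default
# QOperator output (MatMulNBits / MatMulFpQ4).
rtn_cfg = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(
    layer_wise_quant=False,
    quant_format=QuantFormat.QDQ,
)

# The entry layer converts the enum to a plain integer, so the kernels such as
# rtn_quantize and awq_quantize only ever see 0 or 1.
quant_format_int = getattr(rtn_cfg.quant_format, "value", rtn_cfg.quant_format)
print(quant_format_int)  # expected to be 1 for QDQ, per the docstrings above
```
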
@@ -426,6 +411,7 @@ def apply_awq_on_model( model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): nnx model. quant_config (dict): quantization config. calibration_data_reader (data_reader.CalibrationDataReader): data_reader for calibration. + quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -435,6 +421,7 @@ def apply_awq_on_model( "enable_auto_scale": enable_auto_scale, "enable_mse_search": enable_mse_search, "providers": providers, + "quant_format": quant_format, } q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) quant_utils.dump_woq_stats(q_model, quant_config) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index a6ed82e9b..a3c639bb1 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -31,6 +31,8 @@ from typing import List, Union # isort: skip +ort_version = Version(ort.__version__) + def _gptq( W: np.array, @@ -185,6 +187,7 @@ def gptq_quantize( mse: bool = False, perchannel: bool = True, providers: List[str] = ["CPUExecutionProvider"], + quant_format: int = 0, return_modelproto: bool = True, ): """Quant the model with GPTQ method. @@ -211,6 +214,7 @@ def gptq_quantize( mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. + quant_format (int, optional): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True @@ -228,12 +232,12 @@ def gptq_quantize( output_names = [] for node in model.nodes(): # check op_type of node is MatMul + # check op_name in quantization config # check dim 1 of input is weight tensor - # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] + and node.name in weight_config and model.get_initializer(node.input[1]) is not None - and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): output_names.append(node.input[0]) output_names = list(set(output_names)) @@ -262,12 +266,12 @@ def gptq_quantize( for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul + # check op_name in quantization config # check dim 1 of input is weight tensor - # check weight_type is not "fp32" if ( node.op_type in ["MatMul"] + and node.name in weight_config and model.get_initializer(node.input[1]) is not None - and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight = onnx.numpy_helper.to_array( model.get_initializer(model.get_node(node.name).input[1]), base_dir @@ -297,6 +301,7 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): + weight_dtype = weight_config[node.name].get("weight_dtype", "int") num_bits = weight_config[node.name].get("weight_bits", 4) group_size = weight_config[node.name].get("weight_group_size", 32) sym = weight_config[node.name].get("weight_sym", True) @@ -304,6 +309,7 @@ def gptq_quantize( group_size = group_size if group_size != -1 else weight.shape[0] dtype = weight.dtype + # weight -> quant -> dequant -> q_weight q_weight = _gptq( weight, H, @@ -318,40 +324,50 @@ def gptq_quantize( ) weight_tensor = model.get_initializer(node.input[1]) + org_shape = weight.shape init_share_num = model.get_initializer_share_num(node.input[1]) - satisfy_MatMulNBits_condition = Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4 + satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4 satisfy_MatMulFpQ4_condition = ( - Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 + ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) - if model.opset_import[0].version > 20: - q_weight, scale, zp = woq_utility.quant_tensor( - weight.T, num_bits, group_size, scheme, "uint" + if ( + quant_format == 1 # QDQ format + and num_bits in [4, 8] + and ort_version >= constants.ONNXRT119_VERSION + and model.opset_import[0].version > 20 + ): + _, _, zp, scale, q_weight = quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + weight_dtype + str(num_bits), + sym, + axis=1, ) - k_blocks = (org_shape[0] + group_size - 1) // group_size - dequant_node, new_inits = woq_utility.make_weight_only_dequant_node( + dequant_node, new_inits = quant_utils.make_weight_only_dequant_node( node=node, - q_weight=q_weight.astype("uint8"), - k_blocks=k_blocks, - scale=scale.astype(dtype), - axis=1, + weight_shape=org_shape, + num_bits=num_bits, + dtype=weight_dtype, + q_weight=q_weight, + scale=scale.astype(weight.dtype), + axis=0, block_size=group_size, - zero_point=zp if scheme == "asym" else None, + zero_point=zp, ) model.add_initializers(new_inits) model.add_node(dequant_node) + node.name += "_Q" elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( "CUDAExecutionProvider" not in providers and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) ): # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by 
CPU EP # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - org_shape = weight.shape k_blocks = (org_shape[0] + group_size - 1) // group_size q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) _, _, zp, scale, q_weight = quant_utils.quantize_data( q_weight.T.reshape((-1, group_size)), - "uint" + str(num_bits), + weight_dtype + str(num_bits), sym, axis=1, ) @@ -411,6 +427,7 @@ def apply_gptq_on_model( perchannel: bool = True, providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, + quant_format: int = 0, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -418,6 +435,7 @@ def apply_gptq_on_model( model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): onnx model. quant_config (dict): quantization config. calibration_data_reader (data_reader.CalibrationDataReader): data_reader for calibration. + quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -430,6 +448,7 @@ def apply_gptq_on_model( "mse": mse, "perchannel": perchannel, "providers": providers, + "quant_format": quant_format, } if layer_wise_quant: diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index f1cd180f4..ef2105731 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -29,6 +29,8 @@ from typing import List, Union # isort: skip +ort_version = version.Version(ort.__version__) + def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], @@ -56,6 +58,7 @@ def rtn_quantize( }. Defaults to {}. ratios (dict, optional): percentile of clip. Defaults to {}. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. + quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default is 0. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True Returns: @@ -74,19 +77,19 @@ def rtn_quantize( utility.simple_progress_bar(total_num, curr_id) # check op_type of node is MatMul + # check op_name in quantization config # check dim 1 of input is weight tensor - # check weight_type is not "fp32" if ( - node.op_type in ["MatMul"] # check op_type of node is MatMul + node.op_type in ["MatMul"] + and node.name in weight_config and model.get_initializer(node.input[1]) is not None - and weight_config.get(node.name, {}).get("weight_dtype", "fp32") != "fp32" ): weight_tensor = model.get_initializer(node.input[1]) weight = onnx.numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() if len(weight.shape) != 2: continue - dtype = weight.dtype + dtype = weight_config[node.name].get("weight_dtype", "int") num_bits = weight_config[node.name].get("weight_bits", 4) group_size = weight_config[node.name].get("weight_group_size", 32) sym = weight_config[node.name].get("weight_sym", True) @@ -100,33 +103,37 @@ def rtn_quantize( weight = quant_utils.pad_tensor(weight, group_size, k_blocks) - satisfy_MatMulNBits_condition = ( - version.Version(ort.__version__) > constants.ONNXRT1161_VERSION and num_bits == 4 - ) + satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4 satisfy_MatMulFpQ4_condition = ( - version.Version(ort.__version__) >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 + ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 ) - if quant_format == 1: - _, _, zp, scale, q_weight =quant_utils.quantize_data( + if ( + quant_format == 1 # QDQ format + and num_bits in [4, 8] + and ort_version >= constants.ONNXRT119_VERSION + and model.opset_import[0].version > 20 + ): + _, _, zp, scale, q_weight = quant_utils.quantize_data( weight.T.reshape((-1, group_size)), - "uint" + str(num_bits), - False, + dtype + str(num_bits), + sym, ratio=ratios.get(node.input[1], 1), axis=1, ) - dequant_node, new_inits =quant_utils.make_weight_only_dequant_node( + dequant_node, new_inits = quant_utils.make_weight_only_dequant_node( node=node, weight_shape=org_w_shape, num_bits=num_bits, - k_blocks=k_blocks, - q_weight=q_weight.reshape((-1, org_w_shape[-1])), - scale=scale.reshape((org_w_shape[-1], -1)).T.astype(dtype), + dtype=dtype, + q_weight=q_weight, + scale=scale.astype(weight.dtype), axis=0, block_size=group_size, - zero_point=zp.reshape((org_w_shape[-1], -1)).T, + zero_point=zp, ) model.add_initializers(new_inits) new_nodes.append(dequant_node) + node.name += "_Q" elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( "CUDAExecutionProvider" not in providers and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) @@ -135,7 +142,7 @@ def rtn_quantize( # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP _, _, zp, scale, q_weight = quant_utils.quantize_data( weight.T.reshape((-1, group_size)), - "uint" + str(num_bits), + dtype + str(num_bits), sym, ratio=ratios.get(node.input[1], 1), axis=1, @@ -147,7 +154,7 @@ def rtn_quantize( group_size=group_size, k_blocks=k_blocks, q_weight=q_weight, - scale=scale.astype(dtype), + scale=scale.astype(weight.dtype), zero_point=zp if not sym else None, accuracy_level=accuracy_level, ) @@ -155,20 +162,20 @@ def rtn_quantize( model.add_initializers(new_inits) remove_nodes.append(node) new_nodes.append(q_matmul_node) - else: + else: # fake quant q_weight = quant_utils.qdq_data( weight.T.reshape((-1, group_size)), - "int" + str(num_bits), + dtype + 
str(num_bits), sym, ratio=ratios.get(node.input[1], 1), axis=1, ) q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) q_weight = np.transpose(q_weight) - q_weight = q_weight[: org_w_shape[0], :].astype(dtype) + q_weight = q_weight[: org_w_shape[0], :].astype(weight.dtype) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), + data_type=onnx.helper.np_dtype_to_tensor_dtype(q_weight.dtype), dims=weight.shape, vals=q_weight.tobytes(), raw=True, @@ -206,7 +213,7 @@ def apply_rtn_on_model( Args: model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): onnx model. quant_config (dict): quantization config. - quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default 0. + quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. diff --git a/onnx_neural_compressor/constants.py b/onnx_neural_compressor/constants.py index 71caf2a49..54889bda0 100644 --- a/onnx_neural_compressor/constants.py +++ b/onnx_neural_compressor/constants.py @@ -38,6 +38,7 @@ ONNXRT116_VERSION = version.Version("1.16.0") ONNXRT1161_VERSION = version.Version("1.16.1") +ONNXRT119_VERSION = version.Version("1.19.0") PRIORITY_RTN = 60 PRIORITY_GPTQ = 70 diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 5488615e5..74ce14c55 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -136,6 +136,7 @@ def is_large_model(self): """Check the onnx model is over 2GB.""" return self._is_large_model + @property def framework(self): """Return framework.""" return "onnxruntime" @@ -201,10 +202,12 @@ def graph(self): """Return model graph.""" return self._model.graph + @property def ir_version(self): """Return model ir_version.""" return self._model.ir_version + @property def opset_import(self): """Return model opset_import.""" return self._model.opset_import @@ -870,7 +873,9 @@ def _build_input_output_tensor(self, tensor_name, value_info): tensor_type = value_info.get(tensor_name, onnx.TensorProto.FLOAT) return onnx.helper.make_tensor_value_info(tensor_name, tensor_type, None) - def split_model_with_node(self, split_node_name, path_of_model_to_split, save_both_split_models=True): + def split_model_with_node( + self, split_node_name, path_of_model_to_split, save_both_split_models=True, save_path=None + ): """Split model into two parts at a given node. Args: @@ -880,6 +885,7 @@ def split_model_with_node(self, split_node_name, path_of_model_to_split, save_bo False means only save the first split model. True means save both the two split models. Default id True. + save_path (str): path to save split models. 
None means using self.model_path Returns: tuple: the first split model, the second split model @@ -971,7 +977,11 @@ def split_model_with_node(self, split_node_name, path_of_model_to_split, save_bo dir_of_model_to_split = os.path.dirname(path_of_model_to_split) split_model_part_1.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_1_path = os.path.join(dir_of_model_to_split, "split_model_part_1.onnx") + split_model_part_1_path = ( + os.path.join(save_path, "split_model_part_1.onnx") + if save_path is not None + else os.path.join(dir_of_model_to_split, "split_model_part_1.onnx") + ) split_model_part_1.model_path = split_model_part_1_path split_model_part_1._save_split_model(split_model_part_1_path) split_model_part_1.check_is_large_model() @@ -979,7 +989,11 @@ def split_model_with_node(self, split_node_name, path_of_model_to_split, save_bo if save_both_split_models: split_model_part_2.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_2_path = os.path.join(dir_of_model_to_split, "split_model_part_2.onnx") + split_model_part_2_path = ( + os.path.join(save_path, "split_model_part_2.onnx") + if save_path is not None + else os.path.join(dir_of_model_to_split, "split_model_part_2.onnx") + ) split_model_part_2.model_path = split_model_part_2_path split_model_part_2._save_split_model(split_model_part_2_path) split_model_part_2.check_is_large_model() @@ -996,6 +1010,7 @@ def _save_split_model(self, save_path): """ if os.path.exists(save_path + "_data"): os.remove(save_path + "_data") + self._model_path = save_path onnx.save_model( self.model, save_path, diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 45b9cb73d..560b14292 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -69,8 +69,11 @@ def gptq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} - quant_kwargs = {key: getattr(quant_config, key) for key in config.GPTQConfig.model_params_list} + for key in config.GPTQConfig.model_params_list: + val = getattr(quant_config, key) + quant_kwargs[key] = getattr(val, "value", val) # regenerate to ensure data exists calibration_data_reader.rewind() @@ -100,8 +103,11 @@ def awq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} - quant_kwargs = {key: getattr(quant_config, key) for key in config.AWQConfig.model_params_list} + for key in config.AWQConfig.model_params_list: + val = getattr(quant_config, key) + quant_kwargs[key] = getattr(val, "value", val) # regenerate to ensure data exists calibration_data_reader.rewind() diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index c5836ac26..6522d0522 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -711,11 +711,95 @@ class _OperatorConfig(NamedTuple): valid_func_list: List[Callable] = [] +class BaseWeightOnlyConfig(BaseConfig): + """Base config class for weight-only quantization.""" + + def __init__( + self, + weight_dtype: bool = "int", + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + accuracy_level: int = 0, + providers: List[str] = ["CPUExecutionProvider"], + quant_last_matmul: bool = True, + quant_format: quantization.QuantFormat = 
quantization.QuantFormat.QOperator, + nodes_to_exclude: list = [], + white_list: List[Union[str, Callable]] = constants.EMPTY_WHITE_LIST, + ): + """Initialize weight-only quantization config. + + Args: + weight_dtype (str, optional): Data type for weights, support "uint" and "int", default is "int". + weight_bits (int, optional): Number of bits used to represent weights, default is 4. + weight_group_size (int, optional): Size of weight groups, default is 32. + weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. + act_dtype (str, optional): Data type for activations, default is "fp32". + accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), + 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), + 4 (int8 compute type of jblas kernel). Defaults to 0. + ratios (dict, optional): percentile of clip. Defaults to {}. + providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. + quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. + nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. + white_list (list, optional): op in white_list will be applied current config. + Defaults to constants.DEFAULT_WHITE_LIST. + """ + super().__init__(white_list=white_list) + self.weight_bits = weight_bits + self.weight_dtype = weight_dtype + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.accuracy_level = accuracy_level + self.providers = providers + self.quant_last_matmul = quant_last_matmul + self.quant_format = quant_format + self.nodes_to_exclude = nodes_to_exclude + + def get_model_params_dict(self): + result = dict() + for param in self.model_params_list: + result[param] = getattr(self, param) + return result + + def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): + if config_list is None: + config_list = [self] + for config in config_list: + # update model level setting + self._config_mapping.update(config.get_model_params_dict()) + + # update node level setting + last_matmul = None + global_config = config.get_params_dict() + op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() + for op_name, op_type in model_info: + if op_name in self.nodes_to_exclude: + continue + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: + self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: + self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: + if re.match(op_name_pattern, op_name): + self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] + return self._config_mapping + + ######################## RNT Config ############################### @register_config(algo_name=constants.RTN, priority=constants.PRIORITY_RTN) -class RTNConfig(BaseConfig): +class RTNConfig(BaseWeightOnlyConfig): """Config class for round-to-nearest weight-only 
quantization.""" supported_configs: List[_OperatorConfig] = [] @@ -748,12 +832,13 @@ def __init__( layer_wise_quant: bool = False, quant_last_matmul: bool = True, quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, + nodes_to_exclude: list = [], white_list: List[Union[str, Callable]] = constants.RTN_OP_LIST, ): """Init RTN weight-only quantization config. Args: - weight_dtype (str, optional): Data type for weights, default is "int". + weight_dtype (str, optional): Data type for weights, support "uint" and "int", default is "int". weight_bits (int, optional): Number of bits used to represent weights, default is 4. weight_group_size (int, optional): Size of weight groups, default is 32. weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. @@ -768,21 +853,26 @@ def __init__( https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, default is False. quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. + nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. white_list (list, optional): op in white_list will be applied current config. Defaults to constants.DEFAULT_WHITE_LIST. """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.ratios = ratios - self.providers = providers + super().__init__( + weight_bits=weight_bits, + weight_dtype=weight_dtype, + weight_group_size=weight_group_size, + weight_sym=weight_sym, + act_dtype=act_dtype, + accuracy_level=accuracy_level, + providers=providers, + quant_last_matmul=quant_last_matmul, + quant_format=quant_format, + nodes_to_exclude=nodes_to_exclude, + white_list=white_list, + ) self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul - self.quant_format = quant_format + self.ratios = ratios self._post_init() def _post_init(self): @@ -797,12 +887,6 @@ def _post_init(self): elif self.white_list == constants.EMPTY_WHITE_LIST: return - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - @classmethod def register_supported_configs(cls) -> None: supported_configs = [] @@ -817,33 +901,6 @@ def register_supported_configs(cls) -> None: supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) cls.supported_configs = supported_configs - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - self._config_mapping.update(config.get_model_params_dict()) - - # update node level setting - last_matmul = None - global_config = config.get_params_dict() - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if op_type == "MatMul": - last_matmul = op_name - if global_config is not None: - self._config_mapping[op_name] = global_config - if op_type in op_type_config_dict: - self._config_mapping[op_name] = op_type_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - 
self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): - self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() - if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: - del self._config_mapping[last_matmul] - return self._config_mapping - @staticmethod def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.RTN_OP_LIST) -> list: if not isinstance(model, onnx.ModelProto): @@ -875,7 +932,7 @@ def get_default_rtn_config() -> RTNConfig: @register_config(algo_name=constants.GPTQ, priority=constants.PRIORITY_GPTQ) -class GPTQConfig(BaseConfig): +class GPTQConfig(BaseWeightOnlyConfig): """Config class for gptq weight-only quantization.""" supported_configs: List[_OperatorConfig] = [] @@ -895,6 +952,7 @@ class GPTQConfig(BaseConfig): "perchannel", "providers", "layer_wise_quant", + "quant_format", ] name: str = constants.GPTQ @@ -914,6 +972,8 @@ def __init__( providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, quant_last_matmul: bool = True, + quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, + nodes_to_exclude: list = [], white_list: List[Union[str, Callable]] = constants.GPTQ_OP_LIST, ): """Init GPTQ weight-only quantization config. @@ -940,24 +1000,30 @@ def __init__( https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, default is False. quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. + nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. white_list (list, optional): op in white_list will be applied current config. Defaults to constants.DEFAULT_WHITE_LIST. 
""" - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level + super().__init__( + weight_bits=weight_bits, + weight_dtype=weight_dtype, + weight_group_size=weight_group_size, + weight_sym=weight_sym, + act_dtype=act_dtype, + accuracy_level=accuracy_level, + providers=providers, + quant_last_matmul=quant_last_matmul, + quant_format=quant_format, + nodes_to_exclude=nodes_to_exclude, + white_list=white_list, + ) self.percdamp = percdamp self.block_size = block_size self.actorder = actorder self.mse = mse self.perchannel = perchannel - self.providers = providers self.layer_wise_quant = layer_wise_quant - self.quant_last_matmul = quant_last_matmul self._post_init() def _post_init(self): @@ -972,12 +1038,6 @@ def _post_init(self): elif self.white_list == constants.EMPTY_WHITE_LIST: return - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - @classmethod def register_supported_configs(cls) -> None: supported_configs = [] @@ -995,33 +1055,6 @@ def register_supported_configs(cls) -> None: supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) cls.supported_configs = supported_configs - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - self._config_mapping.update(config.get_model_params_dict()) - - # update node level setting - last_matmul = None - global_config = config.get_params_dict() - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if op_type == "MatMul": - last_matmul = op_name - if global_config is not None: - self._config_mapping[op_name] = global_config - if op_type in op_type_config_dict: - self._config_mapping[op_name] = op_type_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): - self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() - if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: - del self._config_mapping[last_matmul] - return self._config_mapping - @staticmethod def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.GPTQ_OP_LIST) -> list: if not isinstance(model, onnx.ModelProto): @@ -1059,7 +1092,7 @@ def get_default_gptq_config() -> GPTQConfig: @register_config(algo_name=constants.AWQ, priority=constants.PRIORITY_AWQ) -class AWQConfig(BaseConfig): +class AWQConfig(BaseWeightOnlyConfig): """Config class for awq weight-only quantization.""" supported_configs: List[_OperatorConfig] = [] @@ -1075,6 +1108,7 @@ class AWQConfig(BaseConfig): "enable_auto_scale", "enable_mse_search", "providers", + "quant_format", ] name: str = constants.AWQ @@ -1090,6 +1124,8 @@ def __init__( enable_mse_search: bool = True, providers: List[str] = ["CPUExecutionProvider"], quant_last_matmul: bool = True, + quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, + nodes_to_exclude: list = [], white_list: List[Union[str, 
Callable]] = constants.AWQ_OP_LIST, ): """Init AWQ weight-only quantization config. @@ -1109,24 +1145,30 @@ def __init__( [0.91, 1.0, 0.01]. Defaults to True. providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. quant_last_matmul (bool, optional): whether to quantize the last matmul of the model, default is True. + quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. + nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. white_list (list, optional): op in white_list will be applied current config. Defaults to constants.DEFAULT_WHITE_LIST. """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level + super().__init__( + weight_bits=weight_bits, + weight_dtype=weight_dtype, + weight_group_size=weight_group_size, + weight_sym=weight_sym, + act_dtype=act_dtype, + accuracy_level=accuracy_level, + providers=providers, + quant_last_matmul=quant_last_matmul, + quant_format=quant_format, + nodes_to_exclude=nodes_to_exclude, + white_list=white_list, + ) self.enable_auto_scale = enable_auto_scale self.enable_mse_search = enable_mse_search - self.providers = providers - self.quant_last_matmul = quant_last_matmul self._post_init() def _post_init(self): - if self.white_list == constants.GPTQ_OP_LIST: + if self.white_list == constants.AWQ_OP_LIST: global_config = self.get_init_args() self._global_config = self.__class__(**global_config, white_list=None) elif isinstance(self.white_list, list) and len(self.white_list) > 0: @@ -1137,12 +1179,6 @@ def _post_init(self): elif self.white_list == constants.EMPTY_WHITE_LIST: return - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - @classmethod def register_supported_configs(cls) -> List[_OperatorConfig]: supported_configs = [] @@ -1159,33 +1195,6 @@ def register_supported_configs(cls) -> List[_OperatorConfig]: supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) cls.supported_configs = supported_configs - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - self._config_mapping.update(config.get_model_params_dict()) - - # update node level setting - last_matmul = None - global_config = config.get_params_dict() - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if op_type == "MatMul": - last_matmul = op_name - if global_config is not None: - self._config_mapping[op_name] = global_config - if op_type in op_type_config_dict: - self._config_mapping[op_name] = op_type_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): - self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() - if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: - del self._config_mapping[last_matmul] - return self._config_mapping - @staticmethod def 
get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.AWQ_OP_LIST) -> list: if not isinstance(model, onnx.ModelProto): @@ -1221,17 +1230,17 @@ def get_default_awq_config() -> AWQConfig: ######################## WOQ Tuning Config ############################### -def get_woq_tuning_config() -> list: +def get_woq_tuning_config(quant_format=quantization.QuantFormat.QOperator) -> list: """Generate the config set for WOQ tuning. Returns: the list of WOQ quant config. """ - RTN_G32ASYM = RTNConfig(weight_sym=False) - GPTQ_G32ASYM = GPTQConfig(weight_sym=False) - GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False) - GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False) - AWQ_G32ASYM = AWQConfig(weight_sym=False) + RTN_G32ASYM = RTNConfig(weight_sym=False, quant_format=quant_format) + GPTQ_G32ASYM = GPTQConfig(weight_sym=False, quant_format=quant_format) + GPTQ_G32ASYM_DISABLE_LAST_MATMUL = GPTQConfig(weight_sym=False, quant_last_matmul=False, quant_format=quant_format) + GPTQ_G128ASYM = GPTQConfig(weight_group_size=128, weight_sym=False, quant_format=quant_format) + AWQ_G32ASYM = AWQConfig(weight_sym=False, quant_format=quant_format) return [RTN_G32ASYM, GPTQ_G32ASYM, GPTQ_G32ASYM_DISABLE_LAST_MATMUL, GPTQ_G128ASYM, AWQ_G32ASYM] diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index 41c58a29f..78f2cb88a 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -31,6 +31,7 @@ def __init__( model: Union[onnx.ModelProto, str], block_size: int = 128, is_symmetric: bool = False, + is_signed: bool = False, accuracy_level: int = 0, nodes_to_exclude=None, algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, @@ -41,6 +42,7 @@ def __init__( model=model, block_size=block_size, is_symmetric=is_symmetric, + is_signed=is_signed, accuracy_level=accuracy_level, nodes_to_exclude=nodes_to_exclude, algo_config=algo_config, diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index e9c3dfb03..11822c0fc 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -21,8 +21,9 @@ import onnxruntime as ort from onnx_neural_compressor import data_reader, logger, onnx_model, utility +from onnx_neural_compressor.quantization import QuantFormat from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import config, QuantFormat +from onnx_neural_compressor.quantization import config class WeightOnlyQuantConfig: @@ -105,6 +106,7 @@ def __init__( model: Union[onnx.ModelProto, str], block_size: int = 128, is_symmetric: bool = False, + is_signed: bool = False, accuracy_level: int = 0, nodes_to_exclude: List[str] = None, algo_config: WeightOnlyQuantConfig = None, @@ -117,6 +119,7 @@ def __init__( self.model = model self.block_size = block_size self.is_symmetric = is_symmetric + self.is_signed = is_signed self.accuracy_level = accuracy_level self.nodes_to_exclude = list(set(nodes_to_exclude)) self.algo_config = algo_config or RTNWeightOnlyQuantConfig() @@ -133,12 +136,14 @@ def __init__( def _generate_nc_config(self): config_class = config.config_registry.get_cls_configs()[self.algorithm.lower()] quant_kwargs = { + "weight_dtype": "int" 
if self.is_signed else "uint", "weight_bits": self.n_bits, "weight_group_size": self.block_size, "weight_sym": self.is_symmetric, "accuracy_level": self.accuracy_level, "providers": self.providers, "quant_format": self.algo_config.quant_format, + "nodes_to_exclude": self.nodes_to_exclude, } if self.algorithm == "RTN": quant_kwargs.update( @@ -166,10 +171,6 @@ def _generate_nc_config(self): ) nc_config = config_class(**quant_kwargs) - if len(self.nodes_to_exclude) > 0: - not_quant_kwargs = {"weight_dtype": "fp32", "white_list": self.nodes_to_exclude} - nc_config += config_class(**not_quant_kwargs) - return nc_config def int4_quant_algo(self): diff --git a/onnx_neural_compressor/quantization/quant_utils.py b/onnx_neural_compressor/quantization/quant_utils.py index 2d5518857..348fa8cdb 100644 --- a/onnx_neural_compressor/quantization/quant_utils.py +++ b/onnx_neural_compressor/quantization/quant_utils.py @@ -25,6 +25,8 @@ class QuantType(enum.Enum): # pragma: no cover QInt8 = 0 QUInt8 = 1 + QInt4 = 4 + QUInt4 = 5 @property def tensor_type(self): @@ -32,6 +34,10 @@ def tensor_type(self): return onnx.TensorProto.INT8 if self == QuantType.QUInt8: return onnx.TensorProto.UINT8 + if self == QuantType.QInt8: + return onnx.TensorProto.INT4 + if self == QuantType.QUInt4: + return onnx.TensorProto.UINT4 raise ValueError(f"Unexpected value qtype={self!r}.") diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index b7def741b..38b1491af 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -10,8 +10,10 @@ import torch import transformers from optimum.exporters.onnx import main_export +from packaging import version from onnx_neural_compressor import data_reader, logger +from onnx_neural_compressor.quantization import QuantFormat from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer @@ -191,13 +193,12 @@ def test_quantize_awq_from_class_beginner(self): def test_quantize_awq_fallback(self): - fp32_config = config.AWQConfig(weight_dtype="fp32") quant_config = config.AWQConfig( weight_dtype="int", weight_sym=False, weight_group_size=32, + nodes_to_exclude=["/h.4/mlp/fc_out/MatMul"], ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) qmodel = self._apply_awq(quant_config) self.assertIsNotNone(qmodel) self.assertEqual(self._count_woq_matmul(qmodel), 29) @@ -215,6 +216,28 @@ def test_quantize_awq_fallback(self): self.assertEqual(self._count_woq_matmul(qmodel), 29) self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_awq_with_QDQ_format(self): + + quant_config = config.AWQConfig( + weight_dtype="int", + weight_sym=False, + weight_group_size=32, + weight_bits=4, + quant_format=QuantFormat.QDQ, + ) + + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + qmodel = algos.awq_quantize_entry(op21_model, quant_config, calibration_data_reader=self.matmul_data_reader) + + self.assertIsNotNone(qmodel) + self.assertTrue("MatMul" in [i.op_type for i in qmodel.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in qmodel.graph.node]) + class TestAWQQuantWithORTLikeAPI(TestAWQQuant): @@ -326,6 +349,32 @@ def test_awq_with_specified_matmul(self): 
self.assertIsNotNone(quant.model) self.assertEqual(self._count_woq_matmul(quant.model, bits=4, group_size=32), 1) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_awq_with_QDQ_format(self): + + algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig( + calibration_data_reader=self.matmul_data_reader, quant_format=QuantFormat.QDQ + ) + + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + + quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( + op21_model, + n_bits=4, + block_size=32, + is_symmetric=False, + algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + quant.process() + self.assertIsNotNone(quant.model) + self.assertTrue("MatMul" in [i.op_type for i in quant.model.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in quant.model.graph.node]) + if __name__ == "__main__": unittest.main() diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index 7902371c7..d4ecf547f 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -10,8 +10,10 @@ import torch import transformers from optimum.exporters.onnx import main_export +from packaging import version from onnx_neural_compressor import data_reader, logger +from onnx_neural_compressor.quantization import QuantFormat from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer @@ -188,14 +190,13 @@ def test_quantize_gptq_from_class_beginner(self): self.assertIsNotNone(qmodel) def test_quantize_gptq_fallback(self): - fp32_config = config.GPTQConfig(weight_dtype="fp32") quant_config = config.GPTQConfig( weight_bits=4, weight_dtype="int", weight_sym=False, weight_group_size=32, + nodes_to_exclude=["/h.4/mlp/fc_out/MatMul"], ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) qmodel = self._apply_gptq(quant_config) self.assertIsNotNone(qmodel) self.assertEqual(self._count_woq_matmul(qmodel), 29) @@ -214,6 +215,26 @@ def test_quantize_gptq_fallback(self): self.assertEqual(self._count_woq_matmul(qmodel), 29) self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_gptq_with_QDQ_format(self): + quant_config = config.GPTQConfig( + weight_bits=4, + weight_dtype="int", + weight_sym=False, + weight_group_size=32, + quant_format=QuantFormat.QDQ, + ) + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + qmodel = algos.gptq_quantize_entry(op21_model, quant_config, calibration_data_reader=self.matmul_data_reader) + + self.assertIsNotNone(qmodel) + self.assertTrue("MatMul" in [i.op_type for i in qmodel.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in qmodel.graph.node]) + class TestGPTQQuantWithORTLikeAPI(TestGPTQQuant): @@ -323,6 +344,32 @@ def test_gptq_with_specified_matmul(self): self.assertIsNotNone(quant.model) self.assertEqual(self._count_woq_matmul(quant.model, bits=4, group_size=32), 1) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_gptq_with_QDQ_format(self): + + 
algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig( + calibration_data_reader=self.matmul_data_reader, quant_format=QuantFormat.QDQ + ) + + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + + quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( + op21_model, + n_bits=4, + block_size=32, + is_symmetric=False, + algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + quant.process() + self.assertIsNotNone(quant.model) + self.assertTrue("MatMul" in [i.op_type for i in quant.model.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in quant.model.graph.node]) + if __name__ == "__main__": unittest.main() diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py index 6f7cea1b8..d294342db 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -8,8 +8,10 @@ import onnx import onnxruntime as ort from optimum.exporters.onnx import main_export +from packaging import version from onnx_neural_compressor import logger +from onnx_neural_compressor.quantization import QuantFormat from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer @@ -138,14 +140,13 @@ def test_quantize_rtn_from_class_beginner(self): def test_quantize_rtn_fallback(self): - fp32_config = config.RTNConfig(weight_dtype="fp32") quant_config = config.RTNConfig( weight_bits=4, weight_dtype="int", weight_sym=False, weight_group_size=32, + nodes_to_exclude=["/h.4/mlp/fc_out/MatMul"], ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) qmodel = self._apply_rtn(quant_config) self.assertIsNotNone(qmodel) self.assertEqual(self._count_woq_matmul(qmodel), 29) @@ -164,6 +165,23 @@ def test_quantize_rtn_fallback(self): self.assertEqual(self._count_woq_matmul(qmodel), 29) self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_rtn_with_QDQ_format(self): + + quant_config = config.RTNConfig( + weight_bits=4, weight_dtype="int", weight_sym=False, weight_group_size=32, quant_format=QuantFormat.QDQ + ) + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + qmodel = algos.rtn_quantize_entry(op21_model, quant_config) + + self.assertIsNotNone(qmodel) + self.assertTrue("MatMul" in [i.op_type for i in qmodel.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in qmodel.graph.node]) + class TestRTNQuantWithORTLikeAPI(TestRTNQuant): @@ -267,6 +285,29 @@ def test_rtn_with_specified_matmul(self): self.assertIsNotNone(quant.model) self.assertEqual(self._count_woq_matmul(quant.model, bits=4, group_size=32), 1) + @unittest.skipIf( + version.Version(ort.__version__) < version.Version("1.19.0"), + "Please use onnxruntime >= 1.19.0 for QDQ format test", + ) + def test_rtn_with_QDQ_format(self): + + algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(quant_format=QuantFormat.QDQ) + op21_model = copy.deepcopy(self.matmul_model) + op21_model.opset_import[0].version = 21 + + quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( + op21_model, + n_bits=4, + block_size=32, + is_symmetric=False, + algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, + ) + 
quant.process() + self.assertIsNotNone(quant.model) + self.assertTrue("MatMul" in [i.op_type for i in quant.model.graph.node]) + self.assertTrue("DequantizeLinear" in [i.op_type for i in quant.model.graph.node]) + if __name__ == "__main__": unittest.main() From 7dcef733f4389c3748232f2370af6d5a468a5207 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 14 Aug 2024 17:34:50 +0800 Subject: [PATCH 05/17] Update README.md Signed-off-by: Wang, Mengni --- .../text_generation/quantization/weight_only/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md index 0173ad33f..99703a8d9 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md @@ -40,6 +40,7 @@ python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \ ## 1. Quantization Set `algorithm=WOQ_TUNE` to tune weight-only quantization algorithm or specify algorithm to `RTN` or `GPTQ` or `AWQ`. + `quant_format=QDQ` works only when: - onnxruntime >= 1.19.0 - opset version of the model >= 21 From 13b69e3aeec4a91ca3748afae03a94690a519d32 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 14 Aug 2024 17:35:31 +0800 Subject: [PATCH 06/17] Update README.md Signed-off-by: Wang, Mengni --- .../text_generation/quantization/weight_only/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md index 99703a8d9..11d3ed27a 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md @@ -45,6 +45,7 @@ Set `algorithm=WOQ_TUNE` to tune weight-only quantization algorithm or specify a - onnxruntime >= 1.19.0 - opset version of the model >= 21 - quantized bits is in [4, 8] + otherwise it will execute QOperator automatically. 
```bash From 04625d0652af3c34402de58103f53c8bec9add27 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 16 Aug 2024 01:33:42 -0700 Subject: [PATCH 07/17] simplify config and fix ut Signed-off-by: Mengni Wang --- .../quantization/weight_only/main.py | 3 +- .../quantization/weight_only/run_benchmark.sh | 38 ++- .../quantization/weight_only/run_quant.sh | 55 ++-- onnx_neural_compressor/algorithms/utility.py | 13 +- .../quantization/algorithm_entry.py | 2 +- onnx_neural_compressor/quantization/config.py | 288 ++++-------------- .../post_training_quant/test_operators.py | 3 + test/quantization/test_algorithm_utility.py | 4 + test/quantization/test_config.py | 22 -- test/utils/test_onnx_model.py | 3 +- 10 files changed, 137 insertions(+), 294 deletions(-) diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py index 742afc18c..db793ea49 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py @@ -74,7 +74,8 @@ parser.add_argument( "--tasks", nargs="+", - default=[ + default=["lambada_openai"], + choices=[ "winogrande", "copa", "piqa", diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh index 97754097b..fc8e60c87 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh @@ -14,19 +14,19 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) + batch_size=$(echo "$var" |cut -f2 -d=) ;; --tokenizer=*) - tokenizer=$(echo $var |cut -f2 -d=) + tokenizer=$(echo "$var" |cut -f2 -d=) ;; --mode=*) - mode=$(echo $var |cut -f2 -d=) + mode=$(echo "$var" |cut -f2 -d=) ;; --intra_op_num_threads=*) - intra_op_num_threads=$(echo $var |cut -f2 -d=) + intra_op_num_threads=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -42,19 +42,27 @@ function run_benchmark { input_model=$(dirname "$input_model") fi + extra_cmd="" + if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then - extra_cmd="--trust_remote_code True" + extra_cmd=$extra_cmd"--trust_remote_code True " + fi + + if [ "${batch_size}" ]; then + extra_cmd=$extra_cmd"--batch_size ${batch_size} " + fi + if [ "${tokenizer}" ]; then + extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " + fi + if [ "${tasks}" ]; then + extra_cmd=$extra_cmd"--tasks ${tasks} " + fi + if [ "${intra_op_num_threads}" ]; then + extra_cmd=$extra_cmd"--intra_op_num_threads ${intra_op_num_threads} " fi - python main.py \ - --model_path="${input_model}" \ - --batch_size="${batch_size-1}" \ - --tokenizer="${tokenizer-meta-llama/Llama-2-7b-hf}" \ - --tasks="${tasks-lambada_openai}" \ - --mode="${mode}" \ - --intra_op_num_threads="${intra_op_num_threads-24}" \ - --benchmark \ - ${extra_cmd} + extra_cmd=$extra_cmd"--benchmark" + eval "python main.py --model_path ${input_model} --mode ${mode} ${extra_cmd}" } diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh index 255611b22..1c8a84681 100644 --- 
a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh @@ -12,25 +12,25 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --output_model=*) - output_model=$(echo $var |cut -f2 -d=) + output_model=$(echo "$var" |cut -f2 -d=) ;; --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) + batch_size=$(echo "$var" |cut -f2 -d=) ;; --dataset=*) - dataset=$(echo $var |cut -f2 -d=) + dataset=$(echo "$var" |cut -f2 -d=) ;; --tokenizer=*) - tokenizer=$(echo $var |cut -f2 -d=) + tokenizer=$(echo "$var" |cut -f2 -d=) ;; --algorithm=*) - algorithm=$(echo $var |cut -f2 -d=) + algorithm=$(echo "$var" |cut -f2 -d=) ;; --quant_format=*) - quant_format=$(echo $var |cut -f2 -d=) + quant_format=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -59,31 +59,42 @@ function run_tuning { echo "Created directory $output_model" fi + extra_cmd="" + if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul" - extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True" + extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True " fi if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul" - extra_cmd="--nodes_to_exclude ${nodes_to_exclude}" + extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " fi if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul" - extra_cmd="--nodes_to_exclude ${nodes_to_exclude}" + extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " + fi + + if [ "${tokenizer}" ]; then + extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " + fi + if [ "${batch_size}" ]; then + extra_cmd=$extra_cmd"--batch_size ${batch_size} " + fi + if [ "${dataset}" ]; then + extra_cmd=$extra_cmd"--dataset ${dataset} " + fi + if [ "${algorithm}" ]; then + extra_cmd=$extra_cmd"--algorithm ${algorithm} " + fi + if [ "${tasks}" ]; then + extra_cmd=$extra_cmd"--tasks ${tasks} " + fi + if [ "${quant_format}" ]; then + extra_cmd=$extra_cmd"--quant_format ${quant_format} " fi - python main.py \ - --model_path "${input_model}" \ - --tokenizer "${tokenizer-meta-llama/Llama-2-7b-hf}" \ - --output_model "${output_model}" \ - --batch_size "${batch_size-1}" \ - --dataset "${dataset-NeelNanda/pile-10k}" \ - --algorithm "${algorithm-WOQ_TUNE}" \ - --tasks "${tasks-lambada_openai}" \ - --quant_format "${quant_format-QOperator}" \ - --layer_wise \ - --tune \ - ${extra_cmd} + extra_cmd=$extra_cmd"--layer_wise --tune" + eval "python main.py --model_path ${input_model} --output_model ${output_model} ${extra_cmd}" } main "$@" diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index b38c8e21e..a2e5d01ec 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -222,14 +222,10 @@ def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False): dtype = _qType_to_np_type(qType) if isinstance(rmax, np.ndarray): if sym: - mask = abs(rmin) > abs(rmax) - scale = np.ones(rmin.shape).astype(rmin.dtype) - scale[mask] = rmin[mask] - scale[~mask] = rmax[~mask] - abs_max = round((qmax - qmin) / 2) - scale /= abs_max - else: - scale = (rmax - rmin) / (qmax - qmin) + max_range = 
np.maximum(abs(rmin), abs(rmax)) + rmin = -max_range + rmax = max_range + scale = (rmax - rmin) / (qmax - qmin) scale[abs(scale) < np.finfo(rmax.dtype).tiny] = 1 zero_point = ( np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) @@ -612,6 +608,7 @@ def dump_woq_stats(model, quantize_config): if optype not in res: res[optype] = {} + if re.match("^.*_Q\d*G\d*", node.input[1]): Q_position = re.search("_Q\d*", node.input[1]) full_position = re.search("_Q\d*G\d*", node.input[1]) diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 560b14292..e58acf2f0 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -192,7 +192,7 @@ def smooth_quant_entry( calibration_data_reader, execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider"), ) - smoothed_model = smoother.transform(**quant_config.to_dict()) + smoothed_model = smoother.transform(**quant_config.get_model_params_dict()) with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: # ORT quant API requires str input onnx.save_model( diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index 6522d0522..1bfe9297d 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -201,6 +201,17 @@ class ExampleAlgorithmConfig: return config_registry.register_config_impl(algo_name=algo_name, priority=priority) +class Encoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, quantization.QuantType): + return getattr(o, "tensor_type") + if isinstance(o, quantization.QuantFormat): + return getattr(o, "value") + if isinstance(o, quantization.CalibrationMethod): + return getattr(o, "name") + return super().default(o) + + class BaseConfig(ABC): """The base config for all algorithm configs.""" @@ -210,22 +221,23 @@ class BaseConfig(ABC): def __init__( self, - white_list: Optional[Union[Union[str, Callable], List[Union[str, Callable]]]] = constants.DEFAULT_WHITE_LIST, + white_list: Optional[List[str]] = constants.DEFAULT_WHITE_LIST, ) -> None: self._global_config: Optional[BaseConfig] = None # local config is the collections of operator_type configs and operator configs self._local_config: Dict[str, Optional[BaseConfig]] = {} self._white_list = white_list self._config_mapping = OrderedDict() + self._post_init() def _post_init(self): if self.white_list == constants.DEFAULT_WHITE_LIST: global_config = self.get_init_args() - self._global_config = self.__class__(**global_config, white_list=None) + self._global_config = self.__class__(**global_config, white_list=constants.EMPTY_WHITE_LIST) elif isinstance(self.white_list, list) and len(self.white_list) > 0: for op_name_or_type in self.white_list: global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) + tmp_config = self.__class__(**global_config, white_list=constants.EMPTY_WHITE_LIST) self.set_local(op_name_or_type, tmp_config) elif self.white_list == constants.EMPTY_WHITE_LIST: return @@ -296,6 +308,19 @@ def get_init_args(self): result[param] = value return result + @staticmethod + def get_model_info(model) -> list: + """Get (node_name, optype) pairs of the model.""" + if not isinstance(model, onnx.ModelProto): + model = onnx.load(model, load_external_data=False) + + ops = [] + for node in model.graph.node: + pair = (node.name, node.op_type) + 
ops.append(pair) + logger.debug(f"Get model info: {ops}") + return ops + def __getitem__(self, key): if hasattr(self, key): return getattr(self, key) @@ -323,7 +348,7 @@ def from_dict(cls, config_dict): operator_config = config_dict.get(constants.LOCAL, {}) if operator_config: for op_name, op_config in operator_config.items(): - config.set_local(op_name, cls(**op_config, white_list=None)) + config.set_local(op_name, cls(**op_config, white_list=constants.EMPTY_WHITE_LIST)) return config def get_diff_dict(self, config) -> Dict[str, Any]: @@ -348,7 +373,7 @@ def from_json_file(cls, filename): def to_json_file(self, filename): config_dict = self.to_dict() with open(filename, "w", encoding="utf-8") as file: - json.dump(config_dict, file, indent=4) + json.dump(config_dict, file, indent=4, cls=Encoder) logger.info("Dump the config into %s.", filename) def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: @@ -367,7 +392,7 @@ def to_json_string(self, use_diff: bool = False) -> Union[str, Dict]: else: config_dict = self.to_dict() try: - return json.dumps(config_dict, indent=2) + "\n" + return json.dumps(config_dict, indent=2, cls=Encoder) + "\n" except Exception as e: logger.error("Failed to serialize the config to JSON string: %s", e) return config_dict @@ -597,7 +622,7 @@ def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[st return config def to_json_string(self, use_diff: bool = False) -> str: - return json.dumps(self.to_dict(), indent=2) + "\n" + return json.dumps(self.to_dict(), indent=2, cls=Encoder) + "\n" def __repr__(self) -> str: return f"{self.__class__.__name__} {self.to_json_string()}" @@ -726,7 +751,7 @@ def __init__( quant_last_matmul: bool = True, quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, nodes_to_exclude: list = [], - white_list: List[Union[str, Callable]] = constants.EMPTY_WHITE_LIST, + white_list: List[Union[str, Callable]] = constants.DEFAULT_WHITE_LIST, ): """Initialize weight-only quantization config. @@ -747,7 +772,6 @@ def __init__( white_list (list, optional): op in white_list will be applied current config. Defaults to constants.DEFAULT_WHITE_LIST. """ - super().__init__(white_list=white_list) self.weight_bits = weight_bits self.weight_dtype = weight_dtype self.weight_group_size = weight_group_size @@ -758,6 +782,7 @@ def __init__( self.quant_last_matmul = quant_last_matmul self.quant_format = quant_format self.nodes_to_exclude = nodes_to_exclude + super().__init__(white_list=white_list) def get_model_params_dict(self): result = dict() @@ -779,6 +804,8 @@ def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: li for op_name, op_type in model_info: if op_name in self.nodes_to_exclude: continue + if op_type not in self.white_list: + continue if op_type == "MatMul": last_matmul = op_name if global_config is not None: @@ -832,8 +859,8 @@ def __init__( layer_wise_quant: bool = False, quant_last_matmul: bool = True, quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, - nodes_to_exclude: list = [], - white_list: List[Union[str, Callable]] = constants.RTN_OP_LIST, + nodes_to_exclude: List[str] = [], + white_list: List[str] = constants.RTN_OP_LIST, ): """Init RTN weight-only quantization config. @@ -856,8 +883,10 @@ def __init__( quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. 
white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. """ + self.layer_wise_quant = layer_wise_quant + self.ratios = ratios + super().__init__( weight_bits=weight_bits, weight_dtype=weight_dtype, @@ -869,23 +898,9 @@ def __init__( quant_last_matmul=quant_last_matmul, quant_format=quant_format, nodes_to_exclude=nodes_to_exclude, - white_list=white_list, + white_list=white_list if white_list != constants.RTN_OP_LIST else constants.DEFAULT_WHITE_LIST, ) - self.layer_wise_quant = layer_wise_quant - self.ratios = ratios - self._post_init() - - def _post_init(self): - if self.white_list == constants.RTN_OP_LIST: - global_config = self.get_init_args() - self._global_config = self.__class__(**global_config, white_list=None) - elif isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) - elif self.white_list == constants.EMPTY_WHITE_LIST: - return + self.white_list = white_list @classmethod def register_supported_configs(cls) -> None: @@ -901,19 +916,6 @@ def register_supported_configs(cls) -> None: supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) cls.supported_configs = supported_configs - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.RTN_OP_LIST) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - @classmethod def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) @@ -973,8 +975,8 @@ def __init__( layer_wise_quant: bool = False, quant_last_matmul: bool = True, quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, - nodes_to_exclude: list = [], - white_list: List[Union[str, Callable]] = constants.GPTQ_OP_LIST, + nodes_to_exclude: List[str] = [], + white_list: List[str] = constants.GPTQ_OP_LIST, ): """Init GPTQ weight-only quantization config. @@ -1003,8 +1005,14 @@ def __init__( quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
""" + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + self.layer_wise_quant = layer_wise_quant + super().__init__( weight_bits=weight_bits, weight_dtype=weight_dtype, @@ -1016,27 +1024,9 @@ def __init__( quant_last_matmul=quant_last_matmul, quant_format=quant_format, nodes_to_exclude=nodes_to_exclude, - white_list=white_list, + white_list=white_list if white_list != constants.GPTQ_OP_LIST else constants.DEFAULT_WHITE_LIST, ) - self.percdamp = percdamp - self.block_size = block_size - self.actorder = actorder - self.mse = mse - self.perchannel = perchannel - self.layer_wise_quant = layer_wise_quant - self._post_init() - - def _post_init(self): - if self.white_list == constants.GPTQ_OP_LIST: - global_config = self.get_init_args() - self._global_config = self.__class__(**global_config, white_list=None) - elif isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) - elif self.white_list == constants.EMPTY_WHITE_LIST: - return + self.white_list = white_list @classmethod def register_supported_configs(cls) -> None: @@ -1055,19 +1045,6 @@ def register_supported_configs(cls) -> None: supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) cls.supported_configs = supported_configs - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.GPTQ_OP_LIST) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - @classmethod def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover return GPTQConfig( @@ -1125,8 +1102,8 @@ def __init__( providers: List[str] = ["CPUExecutionProvider"], quant_last_matmul: bool = True, quant_format: quantization.QuantFormat = quantization.QuantFormat.QOperator, - nodes_to_exclude: list = [], - white_list: List[Union[str, Callable]] = constants.AWQ_OP_LIST, + nodes_to_exclude: List[str] = [], + white_list: List[str] = constants.AWQ_OP_LIST, ): """Init AWQ weight-only quantization config. @@ -1148,8 +1125,10 @@ def __init__( quant_format (QuantFormat, optional): use QOperator or QDQ format, default is QOperator. nodes_to_exclude (list, optional): nodes in nodes_to_exclude list will be skipped during quantization. white_list (list, optional): op in white_list will be applied current config. - Defaults to constants.DEFAULT_WHITE_LIST. 
""" + self.enable_auto_scale = enable_auto_scale + self.enable_mse_search = enable_mse_search + super().__init__( weight_bits=weight_bits, weight_dtype=weight_dtype, @@ -1161,23 +1140,9 @@ def __init__( quant_last_matmul=quant_last_matmul, quant_format=quant_format, nodes_to_exclude=nodes_to_exclude, - white_list=white_list, + white_list=white_list if white_list != constants.AWQ_OP_LIST else constants.DEFAULT_WHITE_LIST, ) - self.enable_auto_scale = enable_auto_scale - self.enable_mse_search = enable_mse_search - self._post_init() - - def _post_init(self): - if self.white_list == constants.AWQ_OP_LIST: - global_config = self.get_init_args() - self._global_config = self.__class__(**global_config, white_list=None) - elif isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) - elif self.white_list == constants.EMPTY_WHITE_LIST: - return + self.white_list = white_list @classmethod def register_supported_configs(cls) -> List[_OperatorConfig]: @@ -1195,19 +1160,6 @@ def register_supported_configs(cls) -> List[_OperatorConfig]: supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) cls.supported_configs = supported_configs - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, pathlib.Path, str], white_list=constants.AWQ_OP_LIST) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - @classmethod def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover return AWQConfig( @@ -1549,7 +1501,6 @@ def __init__( calibration_sampling_size=100, quant_last_matmul=True, execution_provider=None, - white_list: list = constants.DEFAULT_WHITE_LIST, **kwargs, ): """This is a class for static Quant Configuration. 
@@ -1619,7 +1570,6 @@ def __init__( else: os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1" - BaseConfig.__init__(self, white_list=self.op_types_to_quantize) self.execution_provider = execution_provider self.quant_last_matmul = quant_last_matmul self.calibration_sampling_size = calibration_sampling_size @@ -1629,21 +1579,7 @@ def __init__( self.optypes_to_exclude_output_quant = _extra_options.OpTypesToExcludeOutputQuantization self.dedicated_qdq_pair = _extra_options.DedicatedQDQPair self.add_qdq_pair_to_weight = _extra_options.AddQDQPairToWeight - self.white_list = white_list - self._post_init() - - @staticmethod - def get_model_info(model, white_list=constants.STATIC_QOPERATOR_CPU_OP_LIST) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result + BaseConfig.__init__(self, white_list=self.op_types_to_quantize) def get_model_params_dict(self): result = dict() @@ -1659,11 +1595,6 @@ def _post_init(self): for valid_func in STATIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) self.set_local(op_name_or_type, op_config) - if isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: if config_list is None: @@ -1871,34 +1802,6 @@ def register_supported_configs(cls) -> None: ) cls.supported_configs = supported_configs - def to_dict(self): - result = {} - for key, val in self.__dict__.items(): - if key in ["_global_config", "_config_mapping"]: - continue - if key == "_local_config": - local_result = {} - for name, cfg in val.items(): - local_result[name] = cfg.to_dict() - result[key] = local_result - continue - if not isinstance(val, list): - result[key] = ( - getattr(val, "tensor_type", val) - if isinstance(val, quantization.QuantType) - else getattr(val, "value", val) - ) - else: - result[key] = [ - ( - getattr(item, "tensor_type", item) - if isinstance(item, quantization.QuantType) - else getattr(item, "value", item) - ) - for item in val - ] - return result - ######################## SmoohQuant Config ############################### @@ -1934,7 +1837,6 @@ def __init__( calib_iter: int = 100, scales_per_op: bool = True, auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - white_list: list = None, **kwargs, ): """Init smooth quant config. 
@@ -1954,7 +1856,7 @@ def __init__( kwargs (dict): kwargs in below link are supported except calibration_data_reader: https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 """ - super().__init__(white_list=white_list, **kwargs) + super().__init__(**kwargs) self.alpha = alpha self.folding = folding self.op_types = op_types @@ -1970,19 +1872,6 @@ def register_supported_configs(cls) -> List[_OperatorConfig]: supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) cls.supported_configs = supported_configs - @staticmethod - def get_model_info(model, white_list=["Gemm", "Conv", "MatMul", "FusedConv"]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - @classmethod def get_config_set_for_tuning( cls, @@ -2034,7 +1923,6 @@ def __init__( extra_options: dict = None, quant_last_matmul: bool = True, execution_provider: str = None, - white_list: list = constants.DEFAULT_WHITE_LIST, **kwargs, ): if execution_provider is None: @@ -2056,28 +1944,13 @@ def __init__( use_external_data_format=use_external_data_format, extra_options=extra_options, ) - BaseConfig.__init__(self, white_list=op_types_to_quantize) self.execution_provider = execution_provider self.quant_last_matmul = quant_last_matmul self.activation_type = quantization.QuantType.QUInt8 _extra_options = ExtraOptions(**self.extra_options) self.weight_sym = _extra_options.WeightSymmetric self.activation_sym = _extra_options.ActivationSymmetric - self.white_list = white_list - self._post_init() - - @staticmethod - def get_model_info(model, white_list=constants.DYNAMIC_CPU_OP_LIST) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result + BaseConfig.__init__(self, white_list=op_types_to_quantize) def get_model_params_dict(self): result = dict() @@ -2092,11 +1965,6 @@ def _post_init(self): for valid_func in DYNAMIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider) self.set_local(op_name_or_type, op_config) - if isinstance(self.white_list, list) and len(self.white_list) > 0: - for op_name_or_type in self.white_list: - global_config = self.get_init_args() - tmp_config = self.__class__(**global_config, white_list=None) - self.set_local(op_name_or_type, tmp_config) def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: if config_list is None: @@ -2233,34 +2101,6 @@ def register_supported_configs(cls) -> None: ) cls.supported_configs = supported_configs - def to_dict(self): - result = {} - for key, val in self.__dict__.items(): - if key in ["_global_config", "_config_mapping"]: - continue - if key == "_local_config": - local_result = {} - for name, cfg in val.items(): - local_result[name] = cfg.to_dict() - result[key] = local_result - continue - if not isinstance(val, list): - result[key] = ( - getattr(val, "tensor_type", val) - if isinstance(val, quantization.QuantType) - else getattr(val, "value", val) - ) - else: - result[key] = [ - ( - 
getattr(item, "tensor_type", item) - if isinstance(item, quantization.QuantType) - else getattr(item, "value", item) - ) - for item in val - ] - return result - ##################### NC Algo Configs End ################################### diff --git a/test/quantization/post_training_quant/test_operators.py b/test/quantization/post_training_quant/test_operators.py index 45c189328..c06759f3c 100644 --- a/test/quantization/post_training_quant/test_operators.py +++ b/test/quantization/post_training_quant/test_operators.py @@ -78,6 +78,9 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): shutil.rmtree("./onnxrt_test", ignore_errors=True) + os.remove("int8.onnx") + os.remove("qdq.onnx") + os.remove("test.onnx") def qlinear_test(self, model, q_config, quantize_params, quantizable_op_types, **kwargs): quant = quantizer.StaticQuantizer( diff --git a/test/quantization/test_algorithm_utility.py b/test/quantization/test_algorithm_utility.py index 4301545c7..4ab8fc5db 100644 --- a/test/quantization/test_algorithm_utility.py +++ b/test/quantization/test_algorithm_utility.py @@ -40,3 +40,7 @@ def test_is_B_transposed(self): beta=0.35, ) self.assertFalse(quant_utils.is_B_transposed(node)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index 81ccd245d..7a4d3ba7d 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -329,28 +329,6 @@ def test_static_custom_quant_config(self): self.assertLess(idx, 2) def test_config_white_lst(self): - global_config = config.RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = config.RTNConfig(weight_dtype="fp32", white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = algos.rtn_quantize_entry(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst2(self): - global_config = config.RTNConfig(weight_dtype="fp32") - # set operator instance - fc_out_config = config.RTNConfig(weight_bits=4, white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = algos.rtn_quantize_entry(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 1) - self.assertTrue(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst3(self): global_config = config.RTNConfig(weight_bits=4) # set operator instance diff --git a/test/utils/test_onnx_model.py b/test/utils/test_onnx_model.py index f27f64e1f..999b0985b 100644 --- a/test/utils/test_onnx_model.py +++ b/test/utils/test_onnx_model.py @@ -88,6 +88,7 @@ def tearDownClass(self): shutil.rmtree("./gptj", ignore_errors=True) shutil.rmtree("./large_model", ignore_errors=True) os.remove("matmul_add.onnx") + os.remove("model1.onnx") def setUp(self): # print the test name @@ -102,7 +103,7 @@ def test_model_atrribute(self): # model_path self.assertEqual(model.model_path, self.matmul_add_model) # framework - self.assertEqual(model.framework(), "onnxruntime") + self.assertEqual(model.framework, "onnxruntime") # q_config quant_config = config.RTNConfig() model.q_config = quant_config From 54b5388807df489d0a827f93d974bb628f888d6b Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 16 Aug 2024 01:48:10 -0700 Subject: [PATCH 
08/17] fix bug Signed-off-by: Mengni Wang --- test/quantization/test_smooth_quant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 9ad53c148..b27b7f7f0 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -74,9 +74,10 @@ def tearDownClass(self): os.remove("Optimized_model.onnx") def test_sq_config(self): + model = onnx.load(self.gptj) sq_config = config.SmoothQuantConfig() - model_info = sq_config.get_model_info(model=onnx.load(self.gptj)) - self.assertEqual(len(model_info), 40) + model_info = sq_config.get_model_info(model=model) + self.assertEqual(len(model_info), len(model.graph.node)) def test_sq_from_class_beginner(self): self.data_reader.rewind() From be59ac4d3879e73afa2116e4849fddce8215bebc Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 16 Aug 2024 02:46:07 -0700 Subject: [PATCH 09/17] improve ut coverage Signed-off-by: Mengni Wang --- test/quantization/test_algorithm_utility.py | 35 +++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/quantization/test_algorithm_utility.py b/test/quantization/test_algorithm_utility.py index 4ab8fc5db..c908c0ac8 100644 --- a/test/quantization/test_algorithm_utility.py +++ b/test/quantization/test_algorithm_utility.py @@ -6,6 +6,7 @@ import numpy as np import onnx +from onnx_neural_compressor import onnx_model from onnx_neural_compressor.algorithms import utility as quant_utils @@ -41,6 +42,40 @@ def test_is_B_transposed(self): ) self.assertFalse(quant_utils.is_B_transposed(node)) + def test_make_woq_dq_node(self): + node = onnx.helper.make_node("MatMul", ["input", "weight"], "output", name="Matmul") + with self.assertRaises(ValueError): + quant_utils.make_weight_only_dequant_node( + node=node, + weight_shape=(32, 32), + block_size=16, + num_bits=32, + dtype="int", + q_weight=np.random.randint(0, 10, size=(2, 32), dtype=np.uint8), + scale=np.random.random((2, 32)), + zero_point=np.zeros((2, 32)), + ) + + def test_split_shared_bias(self): + input = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 15, 15]) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 5, 11, 11]) + bias_initializer = onnx.numpy_helper.from_array(np.random.random(5).astype(np.float32), name="bias") + conv1_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name="conv1_weight" + ) + conv1_node = onnx.helper.make_node("Conv", ["add_out", "conv1_weight", "bias"], ["conv1_output"], name="conv1") + conv2_weight_initializer = onnx.numpy_helper.from_array( + np.random.randint(-1, 2, [5, 5, 3, 3]).astype(np.float32), name="conv2_weight" + ) + conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight", "bias"], ["conv2_output"], name="conv2") + initializers = [conv1_weight_initializer, conv2_weight_initializer, bias_initializer] + graph = onnx.helper.make_graph([conv1_node, conv2_node], "test", [input], [output], initializer=initializers) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + + update_model = quant_utils.split_shared_bias(onnx_model.ONNXModel(model)) + split = any(["_nc_split_" in i.name for i in update_model.initializer()]) + self.assertTrue(split) + if __name__ == "__main__": unittest.main() From 2ec554cbe80df0bd67b4e4dd4c459cba5594b45e Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Mon, 19 Aug 2024 15:33:03 +0800 Subject: 
[PATCH 10/17] add ut Signed-off-by: Mengni Wang --- test/quantization/test_algorithm_utility.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/quantization/test_algorithm_utility.py b/test/quantization/test_algorithm_utility.py index c908c0ac8..5c899c828 100644 --- a/test/quantization/test_algorithm_utility.py +++ b/test/quantization/test_algorithm_utility.py @@ -76,6 +76,14 @@ def test_split_shared_bias(self): split = any(["_nc_split_" in i.name for i in update_model.initializer()]) self.assertTrue(split) + def test_get_qmin_qmax_for_qType(self): + with self.assertRaises(ValueError): + quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT64) + + qmin, qmax = quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT8, reduce_range=True) + self.assertEqual(qmin, -64) + self.assertEqual(qmax, 64) + if __name__ == "__main__": unittest.main() From d59fdca9efb78b2dd191c342255ee6a57ea10c85 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 20 Aug 2024 17:06:53 +0800 Subject: [PATCH 11/17] enhance dump func Signed-off-by: Mengni Wang --- onnx_neural_compressor/algorithms/utility.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index a2e5d01ec..f7d1cf99d 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -594,18 +594,19 @@ def pad_tensor(weight, group_size, k_blocks): return weight -def dump_woq_stats(model, quantize_config): +def dump_woq_stats(model, quantize_config, white_list=["MatMul"]): res = {} dtype_set = set() for node in model.graph.node: - if node.name.split("_Q")[0] not in quantize_config: - continue if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: optype = "MatMul" else: optype = node.op_type + if optype not in white_list: + continue + if optype not in res: res[optype] = {} From 27eae66707216b6fb15e3b0d6fa3372a8538d93a Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Mon, 26 Aug 2024 14:42:21 +0800 Subject: [PATCH 12/17] fix config setting Signed-off-by: Mengni Wang --- onnx_neural_compressor/quantization/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index 1bfe9297d..d55fb81d7 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -802,10 +802,10 @@ def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: li global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if op_name in self.nodes_to_exclude: - continue if op_type not in self.white_list: continue + if any([re.match(exclude_name, op_name) for exclude_name in self.nodes_to_exclude]): + continue if op_type == "MatMul": last_matmul = op_name if global_config is not None: From b51d8ec3af985791b1e7570cd359558b8b8aace9 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 19 Sep 2024 14:56:27 +0800 Subject: [PATCH 13/17] fix acc issue and refine code Signed-off-by: Mengni Wang --- examples/.config/model_params_onnxrt.json | 27 ++++ .../quantization/weight_only/README.md | 2 +- .../quantization/weight_only/main.py | 2 +- onnx_neural_compressor/algorithms/utility.py | 115 +++++++++++++++--- .../algorithms/weight_only/awq.py | 6 - .../algorithms/weight_only/gptq.py | 101 +++------------ .../algorithms/weight_only/rtn.py | 111 
+++-------------- .../quantization/algorithm_entry.py | 78 +++++------- onnx_neural_compressor/quantization/config.py | 65 ++++++++-- onnx_neural_compressor/quantization/tuning.py | 3 +- test/quantization/test_config.py | 42 +++---- 11 files changed, 266 insertions(+), 286 deletions(-) diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json index 4ade34f75..45fafcb34 100644 --- a/examples/.config/model_params_onnxrt.json +++ b/examples/.config/model_params_onnxrt.json @@ -18,6 +18,15 @@ "batch_size": 1, "algorithm": "RTN" }, + "llama-2-7b-rtn-with-past-qdq": { + "model_name": "meta-llama/Llama-2-7b-hf", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "RTN" + }, "llama-2-7b-awq": { "model_name": "meta-llama/Llama-2-7b-hf", "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", @@ -36,6 +45,15 @@ "batch_size": 1, "algorithm": "AWQ" }, + "llama-2-7b-awq-with-past-qdq": { + "model_name": "meta-llama/Llama-2-7b-hf", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "AWQ" + }, "llama-2-7b-gptq": { "model_name": "meta-llama/Llama-2-7b-hf", "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", @@ -54,6 +72,15 @@ "batch_size": 1, "algorithm": "GPTQ" }, + "llama-2-7b-gptq-with-past-qdq": { + "model_name": "meta-llama/Llama-2-7b-hf", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "GPTQ" + }, "llama-2-7b-woq_tune": { "model_name": "meta-llama/Llama-2-7b-hf", "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md index 11d3ed27a..6bbd8234f 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md @@ -55,7 +55,7 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model --dataset=NeelNanda/pile-10k \ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer --algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \ - --quant_format=QOperator # support QOperator and QDQ + --quant_format=QDQ # support QOperator and QDQ ``` ## 2. Benchmark diff --git a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py index db793ea49..196d7d4d1 100644 --- a/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py @@ -106,7 +106,7 @@ default=[], help="nodes that will not be quantized. 
Doesn't take effect when 'algorithm' is 'WOQ_TUNE'", ) -parser.add_argument("--quant_format", type=str, default="QOperator", choices=["QOperator", "QDQ"]) +parser.add_argument("--quant_format", type=str, default="QDQ", choices=["QOperator", "QDQ"]) args = parser.parse_args() if args.tune and not os.path.exists(args.output_model): diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index f7d1cf99d..9104d0d4e 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -340,7 +340,7 @@ def make_weight_only_dequant_node( input_names = [] kwargs = {"block_size": block_size, "axis": axis} - q_weight = q_weight.reshape((-1, weight_shape[-1])).T + q_weight = q_weight.reshape((weight_shape[-1], -1)).T if num_bits == 4: q_weight = ((q_weight[:, ::2] & 0xF | q_weight[:, 1::2] << 4) & 0xFF).astype("uint8") @@ -536,6 +536,91 @@ def make_matmul_weight_only_node( return matmul_weight_only_node, new_inits +def quant_matmul_weight_only( + node, + weight, + dtype, + num_bits, + sym, + group_size, + ratio=1, + quant_format=None, + accuracy_level=0, +): + new_nodes = [] + new_inits = [] + remove_nodes = [] + + org_w_shape = weight.shape # ic, oc + group_size = group_size if group_size != -1 else org_w_shape[0] + k_blocks = (org_w_shape[0] - 1) // group_size + 1 + weight = pad_tensor(weight, group_size, k_blocks) + + if quant_format == 1: + _, _, zp, scale, q_weight = quantize_data( + weight.T.reshape((-1, group_size)), + dtype + str(num_bits), + sym, + ratio=ratio, + axis=1, + ) + dequant_node, inits = make_weight_only_dequant_node( + node=node, + weight_shape=org_w_shape, + num_bits=num_bits, + dtype=dtype, + q_weight=q_weight, + scale=scale.astype(weight.dtype), + axis=0, + block_size=group_size, + zero_point=zp, + ) + new_nodes.append(dequant_node) + new_inits.extend(inits) + elif quant_format == 0: + _, _, zp, scale, q_weight = quantize_data( + weight.T.reshape((-1, group_size)), + dtype + str(num_bits), + sym, + ratio=ratio, + axis=1, + ) + q_matmul_node, inits = make_matmul_weight_only_node( + node=node, + weight_shape=org_w_shape, + num_bits=num_bits, + group_size=group_size, + k_blocks=k_blocks, + q_weight=q_weight, + scale=scale.astype(weight.dtype), + zero_point=zp if not sym else None, + accuracy_level=accuracy_level, + ) + new_nodes.append(q_matmul_node) + new_inits.extend(inits) + remove_nodes.append(node) + else: + q_weight = qdq_data( + weight.T.reshape((-1, group_size)), + dtype + str(num_bits), + sym, + ratio=ratio, + axis=1, + ) + q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) + q_weight = np.transpose(q_weight) + q_weight = q_weight[: org_w_shape[0], :].astype(weight.dtype) + q_weight_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), + data_type=onnx.helper.np_dtype_to_tensor_dtype(q_weight.dtype), + dims=weight.shape, + vals=q_weight.tobytes(), + raw=True, + ) + node.input[1] = q_weight_tensor.name + new_inits.append(q_weight_tensor) + return new_nodes, new_inits, remove_nodes + def prepare_inputs(model, data_reader, providers): """Prepare inputs for weight only quantization. 
@@ -604,27 +689,25 @@ def dump_woq_stats(model, quantize_config, white_list=["MatMul"]): else: optype = node.op_type - if optype not in white_list: + if optype not in white_list and optype != "DequantizeLinear": continue if optype not in res: res[optype] = {} - if re.match("^.*_Q\d*G\d*", node.input[1]): - Q_position = re.search("_Q\d*", node.input[1]) - full_position = re.search("_Q\d*G\d*", node.input[1]) - dtype = "A32W{}G{}".format( - node.input[1][Q_position.start() + 2 : Q_position.end()], - node.input[1][Q_position.end() + 1 : full_position.end()], - ) - else: - dtype = "FP32" - dtype_set.add(dtype) + dtype = "FP32" + for inp in node.input: + if re.match("^.*_Q\d*G\d*", inp): + Q_position = re.search("_Q\d*", inp) + full_position = re.search("_Q\d*G\d*", inp) + dtype = "A32W{}G{}".format( + inp[Q_position.start() + 2 : Q_position.end()], + inp[Q_position.end() + 1 : full_position.end()], + ) + dtype_set.add(dtype) + break - if dtype in res[optype]: - res[optype][dtype] += 1 - else: - res[optype][dtype] = 1 + res[optype][dtype] = res[optype].get(dtype, 0) + 1 dtype_list = list(dtype_set) for dtype in dtype_list: diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 33eea54c8..55a1b2801 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -270,7 +270,6 @@ def awq_quantize( enable_auto_scale: bool = True, enable_mse_search: bool = True, providers: List[str] = ["CPUExecutionProvider"], - quant_format: int = 0, ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -294,7 +293,6 @@ def awq_quantize( enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - quant_format (int, optional): use Qoperator or QDQ format. 0 means Qoperator, 1 means QDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -391,7 +389,6 @@ def awq_quantize( weight_config=weight_config, ratios=full_ratio, providers=providers, - quant_format=quant_format, ) return model @@ -403,7 +400,6 @@ def apply_awq_on_model( enable_auto_scale: bool = True, enable_mse_search: bool = True, providers: List[str] = ["CPUExecutionProvider"], - quant_format: int = 0, ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. @@ -411,7 +407,6 @@ def apply_awq_on_model( model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): nnx model. quant_config (dict): quantization config. calibration_data_reader (data_reader.CalibrationDataReader): data_reader for calibration. - quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. 
@@ -421,7 +416,6 @@ def apply_awq_on_model( "enable_auto_scale": enable_auto_scale, "enable_mse_search": enable_mse_search, "providers": providers, - "quant_format": quant_format, } q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) quant_utils.dump_woq_stats(q_model, quant_config) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index a3c639bb1..6a6a068a6 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -27,6 +27,7 @@ from onnx_neural_compressor import constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core +from onnx_neural_compressor.algorithms.weight_only import rtn from onnx_neural_compressor.quantization import config from typing import List, Union # isort: skip @@ -187,7 +188,6 @@ def gptq_quantize( mse: bool = False, perchannel: bool = True, providers: List[str] = ["CPUExecutionProvider"], - quant_format: int = 0, return_modelproto: bool = True, ): """Quant the model with GPTQ method. @@ -214,7 +214,6 @@ def gptq_quantize( mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - quant_format (int, optional): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True @@ -301,13 +300,15 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): - weight_dtype = weight_config[node.name].get("weight_dtype", "int") num_bits = weight_config[node.name].get("weight_bits", 4) group_size = weight_config[node.name].get("weight_group_size", 32) sym = weight_config[node.name].get("weight_sym", True) + dtype = weight_config[node.name].get("weight_dtype", "int") accuracy_level = weight_config[node.name].get("accuracy_level", 0) - group_size = group_size if group_size != -1 else weight.shape[0] - dtype = weight.dtype + quant_format = getattr(weight_config[node.name].get("quant_format", None), "value", None) + + weight_tensor = model.get_initializer(node.input[1]) + init_share_num = model.get_initializer_share_num(node.input[1]) # weight -> quant -> dequant -> q_weight q_weight = _gptq( @@ -322,86 +323,25 @@ def gptq_quantize( mse=mse, perchannel=perchannel, ) - - weight_tensor = model.get_initializer(node.input[1]) - org_shape = weight.shape - init_share_num = model.get_initializer_share_num(node.input[1]) - - satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 + new_nodes, new_inits, remove_nodes = quant_utils.quant_matmul_weight_only( + node=node, + weight=weight, + dtype=dtype, + num_bits=num_bits, + sym=sym, + group_size=group_size, + quant_format=quant_format, + accuracy_level=accuracy_level, ) - if ( - quant_format == 1 # QDQ format - and num_bits in [4, 8] - and ort_version >= constants.ONNXRT119_VERSION - and model.opset_import[0].version > 20 - ): - _, _, zp, scale, q_weight = quant_utils.quantize_data( - weight.T.reshape((-1, group_size)), - weight_dtype + 
str(num_bits), - sym, - axis=1, - ) - dequant_node, new_inits = quant_utils.make_weight_only_dequant_node( - node=node, - weight_shape=org_shape, - num_bits=num_bits, - dtype=weight_dtype, - q_weight=q_weight, - scale=scale.astype(weight.dtype), - axis=0, - block_size=group_size, - zero_point=zp, - ) - model.add_initializers(new_inits) - model.add_node(dequant_node) - node.name += "_Q" - elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - k_blocks = (org_shape[0] + group_size - 1) // group_size - q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) - _, _, zp, scale, q_weight = quant_utils.quantize_data( - q_weight.T.reshape((-1, group_size)), - weight_dtype + str(num_bits), - sym, - axis=1, - ) - q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( - node=node, - weight_shape=org_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight, - scale=scale.astype(dtype), - zero_point=zp if not sym else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - model.remove_node(node) - model.add_node(q_matmul_node) - else: - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), - dims=q_weight.shape, - vals=q_weight.astype(dtype).tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name + model.add_initializers(new_inits) + model.add_nodes(new_nodes) + model.remove_nodes(remove_nodes) + if init_share_num == 1: model.remove_initializer(weight_tensor) model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model.topological_sort() # reload external data to prevent external data file path errors @@ -427,7 +367,6 @@ def apply_gptq_on_model( perchannel: bool = True, providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, - quant_format: int = 0, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -435,7 +374,6 @@ def apply_gptq_on_model( model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): onnx model. quant_config (dict): quantization config. calibration_data_reader (data_reader.CalibrationDataReader): data_reader for calibration. - quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 meansQDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -448,7 +386,6 @@ def apply_gptq_on_model( "mse": mse, "perchannel": perchannel, "providers": providers, - "quant_format": quant_format, } if layer_wise_quant: diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index ef2105731..776dc0651 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -37,7 +37,6 @@ def rtn_quantize( weight_config: dict = {}, ratios: dict = {}, providers: List[str] = ["CPUExecutionProvider"], - quant_format: int = 0, return_modelproto: bool = True, ): """Quantize the model with round to nearst method. @@ -58,7 +57,6 @@ def rtn_quantize( }. Defaults to {}. 
ratios (dict, optional): percentile of clip. Defaults to {}. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default is 0. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True Returns: @@ -67,8 +65,8 @@ def rtn_quantize( if not isinstance(model, onnx_model.ONNXModel): model = onnx_model.ONNXModel(model) base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - new_nodes = [] - remove_nodes = [] + new_nodes_all = [] + remove_nodes_all = [] total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]]) curr_id = 0 for node in model.nodes(): @@ -94,99 +92,29 @@ def rtn_quantize( group_size = weight_config[node.name].get("weight_group_size", 32) sym = weight_config[node.name].get("weight_sym", True) accuracy_level = weight_config[node.name].get("accuracy_level", 0) + quant_format = getattr(weight_config[node.name].get("quant_format", None), "value", None) - org_w_shape = weight.shape # ic, oc - group_size = group_size if group_size != -1 else org_w_shape[0] - - k_blocks = (org_w_shape[0] - 1) // group_size + 1 init_share_num = model.get_initializer_share_num(node.input[1]) - weight = quant_utils.pad_tensor(weight, group_size, k_blocks) - - satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32 + new_nodes, new_inits, remove_nodes = quant_utils.quant_matmul_weight_only( + node=node, + weight=weight, + dtype=dtype, + num_bits=num_bits, + sym=sym, + group_size=group_size, + ratio=ratios.get(node.input[1], 1), + quant_format=quant_format, + accuracy_level=accuracy_level, ) - if ( - quant_format == 1 # QDQ format - and num_bits in [4, 8] - and ort_version >= constants.ONNXRT119_VERSION - and model.opset_import[0].version > 20 - ): - _, _, zp, scale, q_weight = quant_utils.quantize_data( - weight.T.reshape((-1, group_size)), - dtype + str(num_bits), - sym, - ratio=ratios.get(node.input[1], 1), - axis=1, - ) - dequant_node, new_inits = quant_utils.make_weight_only_dequant_node( - node=node, - weight_shape=org_w_shape, - num_bits=num_bits, - dtype=dtype, - q_weight=q_weight, - scale=scale.astype(weight.dtype), - axis=0, - block_size=group_size, - zero_point=zp, - ) - model.add_initializers(new_inits) - new_nodes.append(dequant_node) - node.name += "_Q" - elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - _, _, zp, scale, q_weight = quant_utils.quantize_data( - weight.T.reshape((-1, group_size)), - dtype + str(num_bits), - sym, - ratio=ratios.get(node.input[1], 1), - axis=1, - ) - q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( - node=node, - weight_shape=org_w_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight, - scale=scale.astype(weight.dtype), - zero_point=zp if not sym else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - remove_nodes.append(node) - 
new_nodes.append(q_matmul_node) - else: # fake quant - q_weight = quant_utils.qdq_data( - weight.T.reshape((-1, group_size)), - dtype + str(num_bits), - sym, - ratio=ratios.get(node.input[1], 1), - axis=1, - ) - q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) - q_weight = np.transpose(q_weight) - q_weight = q_weight[: org_w_shape[0], :].astype(weight.dtype) - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=onnx.helper.np_dtype_to_tensor_dtype(q_weight.dtype), - dims=weight.shape, - vals=q_weight.tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name + model.add_initializers(new_inits) + new_nodes_all.extend(new_nodes) + remove_nodes_all.extend(remove_nodes) if init_share_num == 1: model.remove_initializer(weight_tensor) - model.add_nodes(new_nodes) - model.remove_nodes(remove_nodes) + model.add_nodes(new_nodes_all) + model.remove_nodes(remove_nodes_all) model.topological_sort() # reload external data to prevent external data file path errors @@ -206,14 +134,12 @@ def apply_rtn_on_model( ratios: dict = {}, providers: List[str] = ["CPUExecutionProvider"], layer_wise_quant: bool = False, - quant_format: int = 0, ) -> onnx.ModelProto: """Apply RTN on onnx model. Args: model (Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str]): onnx model. quant_config (dict): quantization config. - quant_format (int): using QOperator or QDQ format. 0 means QOperator, 1 means QDQ. Default is 0. Returns: onnx.ModelProto: quantized onnx model. @@ -221,7 +147,6 @@ def apply_rtn_on_model( quant_kwargs = { "ratios": ratios, "providers": providers, - "quant_format": quant_format, } if layer_wise_quant: diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index e58acf2f0..f8b67894e 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -18,12 +18,15 @@ import onnx import onnxruntime as ort +from packaging import version from onnx_neural_compressor import constants, data_reader, logger, utility from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn -from onnx_neural_compressor.quantization import config +from onnx_neural_compressor.quantization import QuantFormat, config + +ort_version = version.Version(ort.__version__) ###################### RTN Algo Entry ################################## @@ -32,17 +35,14 @@ def rtn_quantize_entry( model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs ) -> onnx.ModelProto: """The main entry to apply rtn quantization.""" - if len(quant_config.config_mapping) == 0: - # map config to each op - model_info = config.RTNConfig.get_model_info(model=model) - config_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(config_mapping) - else: - config_mapping = quant_config.config_mapping - quant_kwargs = {} - for key in config.RTNConfig.model_params_list: - val = getattr(quant_config, key) - quant_kwargs[key] = getattr(val, "value", val) + config_mapping = quant_config.to_config_mapping(model=model) + + quant_kwargs = dict( + zip( + config.RTNConfig.model_params_list, + [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + ) + ) model = 
rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) return model @@ -62,18 +62,13 @@ def gptq_quantize_entry( calibration_data_reader, data_reader.CalibrationDataReader ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" - if len(quant_config.config_mapping) == 0: - # map config to each op - model_info = config.GPTQConfig.get_model_info(model=model) - config_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(config_mapping) - else: - config_mapping = quant_config.config_mapping - - quant_kwargs = {} - for key in config.GPTQConfig.model_params_list: - val = getattr(quant_config, key) - quant_kwargs[key] = getattr(val, "value", val) + config_mapping = quant_config.to_config_mapping(model=model) + quant_kwargs = dict( + zip( + config.GPTQConfig.model_params_list, + [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + ) + ) # regenerate to ensure data exists calibration_data_reader.rewind() @@ -96,18 +91,13 @@ def awq_quantize_entry( calibration_data_reader, data_reader.CalibrationDataReader ), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader" - if len(quant_config.config_mapping) == 0: - # map config to each op - model_info = config.AWQConfig.get_model_info(model=model) - config_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(config_mapping) - else: - config_mapping = quant_config.config_mapping - - quant_kwargs = {} - for key in config.AWQConfig.model_params_list: - val = getattr(quant_config, key) - quant_kwargs[key] = getattr(val, "value", val) + config_mapping = quant_config.to_config_mapping(model=model) + quant_kwargs = dict( + zip( + config.AWQConfig.model_params_list, + [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + ) + ) # regenerate to ensure data exists calibration_data_reader.rewind() @@ -134,13 +124,7 @@ def static_quantize_entry( calibration_data_reader, data_reader.CalibrationDataReader ), "Please follow onnx_neural_compressor/quantization/calibrate.py to implement calibration_data_reader" - if len(quant_config.config_mapping) == 0: - # map config to each op - model_info = config.StaticQuantConfig.get_model_info(model=model) - config_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(config_mapping) - else: - config_mapping = quant_config.config_mapping + config_mapping = quant_config.to_config_mapping(model=model) calibration_data_reader.rewind() augment = calibrate.ONNXRTAugment( @@ -235,13 +219,7 @@ def dynamic_quantize_entry( logger.warning("No candidate op type to do quantization, exit.") exit(0) - if len(quant_config.config_mapping) == 0: - # map config to each op - model_info = config.DynamicQuantConfig.get_model_info(model=model) - config_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(config_mapping) - else: - config_mapping = quant_config.config_mapping + config_mapping = quant_config.to_config_mapping(model=model) _quantizer = quantizer.DynamicQuantizer( model, diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index d55fb81d7..a808c4e7b 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -27,8 +27,10 @@ import numpy as np import onnx +import onnxruntime as ort import pydantic from onnxruntime import quantization as ort_quant +from packaging import version from 
typing_extensions import Self from onnx_neural_compressor import constants, data_reader, logger, quantization, utility @@ -36,6 +38,8 @@ from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip +ort_version = version.Version(ort.__version__) + class ParamLevel(enum.Enum): OP_LEVEL = enum.auto() @@ -559,10 +563,13 @@ def _get_op_name_op_type_config(self): return op_type_config_dict, op_name_config_dict def to_config_mapping( - self, config_list: Optional[List[BaseConfig]] = None, model_info: List[Tuple[str, str]] = None + self, + model: Union[onnx.ModelProto, str], + config_list: Optional[List[BaseConfig]] = None, ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: if config_list is None: config_list = [self] + model_info = BaseConfig.get_model_info(model) for config in config_list: op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: @@ -628,8 +635,9 @@ def __repr__(self) -> str: return f"{self.__class__.__name__} {self.to_json_string()}" def to_config_mapping( - self, config_list: List[BaseConfig] = None, model_info: Dict[str, Any] = None + self, model: Union[onnx.ModelProto, str], config_list: List[BaseConfig] = None ) -> OrderedDict[str, BaseConfig]: + model_info = BaseConfig.get_model_info(model) for config in self.config_list: op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() single_config_model_info = model_info.get(config.name, None) @@ -790,7 +798,11 @@ def get_model_params_dict(self): result[param] = getattr(self, param) return result - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): + def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: List[BaseConfig] = None): + if isinstance(model, str): + model = onnx.load(model, load_external_data=False) + + model_info = BaseConfig.get_model_info(model) if config_list is None: config_list = [self] for config in config_list: @@ -804,19 +816,46 @@ def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: li for op_name, op_type in model_info: if op_type not in self.white_list: continue + + # skip excluded op if any([re.match(exclude_name, op_name) for exclude_name in self.nodes_to_exclude]): continue + if op_type == "MatMul": last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config + if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] + for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] + + # convert config to dict if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + + # update quant_format + if ( + ort_version < constants.ONNXRT119_VERSION + or model.opset_import[0].version < 21 + or self._config_mapping[op_name].get("weight_bits", 4) not in [4, 8] + ): + self._config_mapping[op_name].update({"quant_format": quantization.QuantFormat.QOperator}) + if ( + self._config_mapping[op_name].get("weight_bits", 4) != 4 + or ort_version < constants.ONNXRT116_VERSION + or ( + ort_version <= constants.ONNXRT1161_VERSION + and self._config_mapping[op_name].get("weight_group_size", 32) != 32 + ) + ): + # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 
1.16.1 versions + # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 + del self._config_mapping[op_name]["quant_format"] if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: del self._config_mapping[last_matmul] return self._config_mapping @@ -838,11 +877,11 @@ class RTNConfig(BaseWeightOnlyConfig): "act_dtype", "accuracy_level", "ratios", + "quant_format", ] model_params_list: List[str] = [ "providers", "layer_wise_quant", - "quant_format", ] name: str = constants.RTN @@ -945,6 +984,7 @@ class GPTQConfig(BaseWeightOnlyConfig): "weight_sym", "act_dtype", "accuracy_level", + "quant_format", ] model_params_list: List[Union[str, TuningParam]] = [ "percdamp", @@ -954,7 +994,6 @@ class GPTQConfig(BaseWeightOnlyConfig): "perchannel", "providers", "layer_wise_quant", - "quant_format", ] name: str = constants.GPTQ @@ -1080,12 +1119,12 @@ class AWQConfig(BaseWeightOnlyConfig): "weight_sym", "act_dtype", "accuracy_level", + "quant_format", ] model_params_list: List[str] = [ "enable_auto_scale", "enable_mse_search", "providers", - "quant_format", ] name: str = constants.AWQ @@ -1596,7 +1635,12 @@ def _post_init(self): op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) self.set_local(op_name_or_type, op_config) - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: List[BaseConfig] = None): + if isinstance(model, str): + model = onnx.load(model, load_external_data=False) + + model_info = BaseConfig.get_model_info(model) + if config_list is None: config_list = [self] for config in config_list: @@ -1966,7 +2010,12 @@ def _post_init(self): op_config = valid_func(op_config, op_name_or_type, self.execution_provider) self.set_local(op_name_or_type, op_config) - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: + def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: List[BaseConfig] = None): + if isinstance(model, str): + model = onnx.load(model, load_external_data=False) + + model_info = BaseConfig.get_model_info(model) + if config_list is None: config_list = [self] for config in config_list: diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index 385ac63c0..862c2f40f 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -529,8 +529,7 @@ def autotune( tuning_logger.tuning_start() for trial_index, quant_config in enumerate(config_loader): # check whether config_mapping is verified - model_info = quant_config.__class__.get_model_info(model=model_input) - config_mapping = quant_config.to_config_mapping(model_info=model_info) + config_mapping = quant_config.to_config_mapping(model=model_input) if tuning_monitor.need_skip(config_mapping): continue diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index 7a4d3ba7d..7b33f9b9d 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -100,8 +100,7 @@ def test_dynamic_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping 
= quant_config.to_config_mapping(model=self.simple_onnx_model) if idx == 0: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: @@ -125,8 +124,7 @@ def test_dynamic_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) self.assertTrue("add" not in configs_mapping) self.assertTrue("add2" not in configs_mapping) self.assertTrue("Matmul" not in configs_mapping) @@ -143,8 +141,7 @@ def test_dynamic_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if idx == 0: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: @@ -161,8 +158,7 @@ def test_dynamic_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) self.assertTrue("add" not in configs_mapping) self.assertTrue("add2" not in configs_mapping) self.assertTrue("Matmul" not in configs_mapping) @@ -179,8 +175,7 @@ def test_static_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if idx in [0, 4]: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx in [1, 5]: @@ -202,8 +197,7 @@ def test_static_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) self.assertTrue("add" not in configs_mapping) self.assertTrue("add2" not in configs_mapping) self.assertTrue("Matmul" not in configs_mapping) @@ -218,8 +212,7 @@ def test_static_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if "Matmul" in configs_mapping: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) 
self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax") @@ -236,8 +229,7 @@ def test_static_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if idx in [0, 4]: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx in [1, 5]: @@ -262,8 +254,7 @@ def test_static_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if idx == 0: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: @@ -281,8 +272,7 @@ def test_static_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) self.assertTrue("add" not in configs_mapping) self.assertTrue("add2" not in configs_mapping) self.assertTrue("Matmul" not in configs_mapping) @@ -299,8 +289,7 @@ def test_static_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) self.assertFalse(configs_mapping["Matmul"]["per_channel"]) self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax") self.assertLess(idx, 4) @@ -315,8 +304,7 @@ def test_static_custom_quant_config(self): ) config_loader = tuning.ConfigLoader(config_set=tuning_config.config_set, sampler=tuning_config.sampler) for idx, quant_config in enumerate(config_loader): - model_info = quant_config.get_model_info(model=self.simple_onnx_model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=self.simple_onnx_model) if idx == 0: self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: @@ -338,7 +326,7 @@ def test_config_white_lst(self): fp32_model = self.gptj model_info = config.RTNConfig.get_model_info(fp32_model) logger.info(quant_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=fp32_model) logger.info(configs_mapping) self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8) self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4) @@ -411,14 +399,14 @@ def test_config_mapping(self): fp32_model = self.gptj model_info = config.RTNConfig.get_model_info(fp32_model) logger.info(quant_config) - configs_mapping = 
quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=fp32_model) logger.info(configs_mapping) self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 8) self.assertTrue(configs_mapping["/h.4/mlp/fc_in/MatMul"]["weight_bits"] == 4) # test regular matching fc_config = config.RTNConfig(weight_bits=3) quant_config.set_local("/h.[1-4]/mlp/fc_out/MatMul", fc_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) + configs_mapping = quant_config.to_config_mapping(model=fp32_model) logger.info(configs_mapping) self.assertTrue(configs_mapping["/h.4/mlp/fc_out/MatMul"]["weight_bits"] == 3) self.assertTrue(configs_mapping["/h.3/mlp/fc_out/MatMul"]["weight_bits"] == 3) From d4550615bf0a4125498bf8b57f73251bf35f871f Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 19 Sep 2024 15:04:06 +0800 Subject: [PATCH 14/17] fix CI Signed-off-by: Mengni Wang --- onnx_neural_compressor/algorithms/utility.py | 1 + onnx_neural_compressor/quantization/config.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 9104d0d4e..7d080ad8f 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -621,6 +621,7 @@ def quant_matmul_weight_only( new_inits.append(q_weight_tensor) return new_nodes, new_inits, remove_nodes + def prepare_inputs(model, data_reader, providers): """Prepare inputs for weight only quantization. diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index a808c4e7b..61d393dfb 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -1640,7 +1640,7 @@ def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: Lis model = onnx.load(model, load_external_data=False) model_info = BaseConfig.get_model_info(model) - + if config_list is None: config_list = [self] for config in config_list: @@ -2015,7 +2015,7 @@ def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: Lis model = onnx.load(model, load_external_data=False) model_info = BaseConfig.get_model_info(model) - + if config_list is None: config_list = [self] for config in config_list: From f343357972b84fa4f549206bbf0971ac6c7443c0 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 19 Sep 2024 16:27:52 +0800 Subject: [PATCH 15/17] fix ut Signed-off-by: Mengni Wang --- .../quantization/algorithm_entry.py | 12 ++++++------ onnx_neural_compressor/quantization/config.py | 2 +- test/quantization/weight_only/test_awq.py | 1 + test/quantization/weight_only/test_gptq.py | 1 + test/utils/test_general.py | 3 +-- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index f8b67894e..703e250e2 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -39,8 +39,8 @@ def rtn_quantize_entry( quant_kwargs = dict( zip( - config.RTNConfig.model_params_list, - [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + quant_config.model_params_list, + [getattr(quant_config, key, None) for key in quant_config.model_params_list], ) ) model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) @@ -65,8 +65,8 @@ def 
gptq_quantize_entry( config_mapping = quant_config.to_config_mapping(model=model) quant_kwargs = dict( zip( - config.GPTQConfig.model_params_list, - [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + quant_config.model_params_list, + [getattr(quant_config, key, None) for key in quant_config.model_params_list], ) ) @@ -94,8 +94,8 @@ def awq_quantize_entry( config_mapping = quant_config.to_config_mapping(model=model) quant_kwargs = dict( zip( - config.AWQConfig.model_params_list, - [getattr(quant_config, key, None) for key in config.RTNConfig.model_params_list], + quant_config.model_params_list, + [getattr(quant_config, key, None) for key in quant_config.model_params_list], ) ) diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index 61d393dfb..4f92752cc 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -637,7 +637,7 @@ def __repr__(self) -> str: def to_config_mapping( self, model: Union[onnx.ModelProto, str], config_list: List[BaseConfig] = None ) -> OrderedDict[str, BaseConfig]: - model_info = BaseConfig.get_model_info(model) + model_info = self.get_model_info(model) for config in self.config_list: op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() single_config_model_info = model_info.get(config.name, None) diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 38b1491af..ec2d9d1ea 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -155,6 +155,7 @@ def test_awq_params_combination(self): "weight_sym": [True, False], "act_dtype": ["fp32"], "accuracy_level": [0], + "quant_format": [0, 1], "enable_auto_scale": [True, False], "enable_mse_search": [True, False], } diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index d4ecf547f..789d5f684 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -153,6 +153,7 @@ def test_gptq_params_combination(self): "weight_sym": [True, False], "act_dtype": ["fp32"], "accuracy_level": [0], + "quant_format": [0, 1], "percdamp": [0.01], "blocksize": [128], "actorder": [True, False], diff --git a/test/utils/test_general.py b/test/utils/test_general.py index b07d73115..be73c8549 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -212,8 +212,7 @@ def test_mixed_two_algos(self): fake_config = FakeAlgoConfig(weight_bits=4, white_list=[OP1_NAME]) fake1_config = FakeAlgoOneConfig(weight_bits=2, white_list=[OP2_NAME]) mixed_config = fake_config + fake1_config - model_info = mixed_config.get_model_info(model) - config_mapping = mixed_config.to_config_mapping(model_info=model_info) + config_mapping = mixed_config.to_config_mapping(model=model) self.assertIn(OP1_NAME, config_mapping) self.assertIn(OP2_NAME, config_mapping) From 745b09969d687425355ac10e76765d4a2e2fe313 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 19 Sep 2024 16:59:42 +0800 Subject: [PATCH 16/17] update code Signed-off-by: Mengni Wang --- onnx_neural_compressor/algorithms/weight_only/gptq.py | 3 --- onnx_neural_compressor/algorithms/weight_only/rtn.py | 3 --- onnx_neural_compressor/quantization/config.py | 8 ++++---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 
6a6a068a6..06d184b19 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -22,7 +22,6 @@ import numpy as np import onnx import onnxruntime as ort -from packaging.version import Version from onnx_neural_compressor import constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils @@ -32,8 +31,6 @@ from typing import List, Union # isort: skip -ort_version = Version(ort.__version__) - def _gptq( W: np.array, diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 776dc0651..72f061554 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -21,7 +21,6 @@ import numpy as np import onnx import onnxruntime as ort -from packaging import version from onnx_neural_compressor import constants, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils @@ -29,8 +28,6 @@ from typing import List, Union # isort: skip -ort_version = version.Version(ort.__version__) - def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index 4f92752cc..db5a18e18 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -569,7 +569,7 @@ def to_config_mapping( ) -> OrderedDict[Tuple[str, str], OrderedDict[str, BaseConfig]]: if config_list is None: config_list = [self] - model_info = BaseConfig.get_model_info(model) + model_info = self.get_model_info(model) for config in config_list: op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: @@ -802,7 +802,7 @@ def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: Lis if isinstance(model, str): model = onnx.load(model, load_external_data=False) - model_info = BaseConfig.get_model_info(model) + model_info = self.get_model_info(model) if config_list is None: config_list = [self] for config in config_list: @@ -1639,7 +1639,7 @@ def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: Lis if isinstance(model, str): model = onnx.load(model, load_external_data=False) - model_info = BaseConfig.get_model_info(model) + model_info = self.get_model_info(model) if config_list is None: config_list = [self] @@ -2014,7 +2014,7 @@ def to_config_mapping(self, model: Union[onnx.ModelProto, str], config_list: Lis if isinstance(model, str): model = onnx.load(model, load_external_data=False) - model_info = BaseConfig.get_model_info(model) + model_info = self.get_model_info(model) if config_list is None: config_list = [self] From 9add47eed1806132a1177bfb298061411686f711 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 20 Sep 2024 10:24:30 +0800 Subject: [PATCH 17/17] remove unused code and add ut Signed-off-by: Mengni Wang --- onnx_neural_compressor/onnx_model.py | 10 ---------- onnx_neural_compressor/quantization/config.py | 3 +++ test/utils/test_general.py | 6 ++++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 74ce14c55..efc9cf9c8 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -472,16 +472,6 @@ def replace_output_of_all_nodes(self, old_output_name, 
new_output_name, white_op if node.op_type not in black_optype: ONNXModel.replace_node_output(node, old_output_name, new_output_name) - def remove_duplicate_nodes(self): - """remove duplicate nodes""" - new_nodes = [] - for node in self.nodes(): - if node not in new_nodes: - new_nodes.append(node) - self.model.graph.ClearField("node") - self.model.graph.node.extend(new_nodes) - self.update() - def remove_unused_nodes(self): """Remove unused nodes.""" unused_nodes = [] diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py index db5a18e18..bc761dcce 100644 --- a/onnx_neural_compressor/quantization/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -571,8 +571,11 @@ def to_config_mapping( config_list = [self] model_info = self.get_model_info(model) for config in config_list: + global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: + if global_config is not None: + self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_name_config_dict[op_type] for op_name_pattern in op_name_config_dict: diff --git a/test/utils/test_general.py b/test/utils/test_general.py index be73c8549..ee42c8714 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -197,6 +197,12 @@ def test_api(self): DEFAULT_WEIGHT_BITS, ) + model = FakeModel() + fake_default_config.set_local("OP1_NAME", FakeAlgoConfig(weight_dtype="uint")) + config_mapping = fake_default_config.to_config_mapping(model) + self.assertEqual(config_mapping["OP1_NAME"]["weight_dtype"], "uint") + self.assertEqual(config_mapping["OP2_NAME"]["weight_dtype"], "int") + def test_config_expand_complex_tunable_type(self): target_op_type_list_options = [["Conv", "Gemm"], ["Conv", "Matmul"]] configs = FakeAlgoConfig(target_op_type_list=target_op_type_list_options)
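
Note on the final patch: the new unit test above relies on the precedence that the updated `to_config_mapping` establishes, where the config's global params dict is applied to every op first and a `set_local` entry for a specific op name then overrides it. The following is a minimal standalone sketch of that precedence only, under simplifying assumptions: `build_config_mapping`, `global_params`, and `local_params` are hypothetical stand-ins written for illustration, not the library's own classes or signatures, and real configs carry far more fields than `weight_dtype` plus additional op-type matching not shown here.

```python
# Illustrative sketch (hypothetical helper, not library code): global params are
# assigned to every op, then a per-op-name ("local") entry wins over the global one.
def build_config_mapping(model_info, global_params, local_params):
    """model_info: list of (op_name, op_type) pairs; *_params: plain dicts."""
    mapping = {}
    for op_name, _op_type in model_info:
        if global_params is not None:
            mapping[op_name] = dict(global_params)          # global default
        if op_name in local_params:
            mapping[op_name] = dict(local_params[op_name])  # local override wins
    return mapping


model_info = [("OP1_NAME", "MatMul"), ("OP2_NAME", "MatMul")]
mapping = build_config_mapping(
    model_info,
    global_params={"weight_dtype": "int"},
    local_params={"OP1_NAME": {"weight_dtype": "uint"}},
)
assert mapping["OP1_NAME"]["weight_dtype"] == "uint"  # overridden for OP1_NAME
assert mapping["OP2_NAME"]["weight_dtype"] == "int"   # global default kept
```

This mirrors the behavior the added test asserts: `OP1_NAME` picks up the locally set `weight_dtype="uint"` while `OP2_NAME` falls back to the global default.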