[Fix] In the Xnnpack EP, the conversion for the fused activation param isn't correct #23115
@@ -6,9 +6,11 @@
#include "core/common/logging/logging.h" | ||||||||||||||||||
#include "core/common/span_utils.h" | ||||||||||||||||||
#include "core/framework/float16.h" | ||||||||||||||||||
#include "core/framework/utils.h" | ||||||||||||||||||
#include "core/graph/graph.h" | ||||||||||||||||||
#include "core/providers/xnnpack/xnnpack_execution_provider.h" | ||||||||||||||||||
#include "core/providers/xnnpack/xnnpack_init.h" | ||||||||||||||||||
#include "core/session/inference_session.h" | ||||||||||||||||||
#include "core/session/onnxruntime_cxx_api.h" | ||||||||||||||||||
#include "core/session/onnxruntime_session_options_config_keys.h" | ||||||||||||||||||
|
@@ -89,6 +91,51 @@ TEST(XnnpackEP, TestNhwcConvReluClipFusion) {
  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion", std::move(ep), feeds, params);
}

#ifdef XNNPACK_FP16_SUPPORTED
TEST(XnnpackEP, TestNhwcConvReluClipFusion_FP16) {
  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_clip_relu_fp16.onnx";

  RandomValueGenerator generator;
  TensorShape input_shape_x{1, 16, 16, 192};
  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);

  OrtValue ml_value_x;
  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);

  NameMLValMap feeds;
  feeds.insert(std::make_pair("model_input", ml_value_x));

  std::function<void(const Graph&)> verify = [](const Graph& graph) -> void {
    ASSERT_EQ(graph.NumberOfNodes(), 3) << "Transpose nodes should have been removed, and "
                                           "Conv+Relu and Conv+Clip should have been fused, leaving 3 nodes.";
    auto node_iter = graph.Nodes().begin();
    auto check_node = [](const Node& node, const std::string& fusion_type) {
      const auto& attr = node.GetAttributes();
      auto activation = attr.find("activation");
      ASSERT_NE(activation, attr.cend()) << "Fused node should have activation attribute";
      ASSERT_EQ(activation->second.s(), fusion_type);
    };

    // check 2nd and 3rd nodes.
    // the first node is the Conv that does not get fused (created after first call to GetCapability)
    // the 2nd and 3rd nodes are the fused nodes (created after second call to GetCapability)
    ++node_iter;
    check_node(*node_iter, "Clip");
    ++node_iter;
    check_node(*node_iter, "Relu");
  };

  EPVerificationParams params;
  params.ep_node_assignment = ExpectedEPNodeAssignment::All;
  params.fp32_abs_err = 0.0002f;
  params.graph_verifier = &verify;

  auto ep = DefaultXnnpackExecutionProvider();
  // So far, the CPU EP doesn't support FP16 Conv fusion, so verify_outputs is skipped.
  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion_FP16", std::move(ep), feeds, params, {}, false);
Review comment: Not quite following. There should still be valid output from the CPU EP even if it doesn't fuse, so why can't we use verify_outputs?

Reply: thx, fixed

Reply: So far, the CPU EP doesn't implement FP16 Clip fusion. The output verification fails because it looks like the CPU EP falls back to FP32 Clip (see onnxruntime/onnxruntime/core/providers/cpu/fp16/fp16_activations.h, lines 74 to 77 in e0e8304).

To verify the correctness of the XNNPACK FP16 Conv fusion, I added a new test with a new FP16 model (Conv+Relu only). The current test (Conv+Clip+Relu) is kept because I want to make sure that the Conv+Clip fusion can run, that is, that the activation parameters are added correctly.
}
#endif
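
Per the reply in the thread above, a separate FP16 test with a Conv+Relu-only model was added so output verification against the CPU EP can stay enabled. A minimal sketch of what such a test could look like, reusing the helpers from the test above; the model file name nhwc_conv_relu_fp16.onnx and the test name are assumptions, not the PR's exact code:

#ifdef XNNPACK_FP16_SUPPORTED
TEST(XnnpackEP, TestNhwcConvReluFusion_FP16) {
  // Hypothetical model file name; the PR adds its own Conv+Relu FP16 model.
  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_relu_fp16.onnx";

  RandomValueGenerator generator;
  TensorShape input_shape_x{1, 16, 16, 192};
  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);

  OrtValue ml_value_x;
  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);

  NameMLValMap feeds;
  feeds.insert(std::make_pair("model_input", ml_value_x));

  EPVerificationParams params;
  params.ep_node_assignment = ExpectedEPNodeAssignment::All;
  params.fp32_abs_err = 0.0002f;

  auto ep = DefaultXnnpackExecutionProvider();
  // verify_outputs stays enabled here: the CPU EP can run FP16 Relu, so the
  // outputs of the two EPs can be compared directly.
  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluFusion_FP16", std::move(ep), feeds, params);
}
#endif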
|
||||||||||||||||||
// test we can share the cpu ep allocator with the xnnpack EP | ||||||||||||||||||
TEST(XnnpackEP, TestAllocatorSharing) { | ||||||||||||||||||
auto init_session = [](std::vector<std::shared_ptr<IExecutionProvider>>& eps, | ||||||||||||||||||
|
Review comment: What if GetType(arg, arg_type) failed here?

Reply: Generally type info is always available, so I think this is ok. Shape info may be missing depending on the model.

The Conv op looks to be set up to allow fp32, u8, s8 and optionally fp16. Should this also handle u8 and s8, or should ClipReluChecker limit fusion to fp32 and fp16?
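
For readers outside the diff context, the pattern under discussion resolves a NodeArg's element type before deciding whether to fuse. A hypothetical sketch of such a check using public graph APIs; the helper name GetElementTypeOrDeclineFusion is illustrative, not the EP's actual GetType:

#include "core/graph/graph.h"
#include "core/graph/onnx_protobuf.h"

// Hypothetical helper mirroring the pattern discussed above: resolve the
// element type of a NodeArg, and decline fusion when type info is missing
// instead of proceeding with an unknown type.
static bool GetElementTypeOrDeclineFusion(const onnxruntime::NodeArg& arg, int32_t& arg_type) {
  const ONNX_NAMESPACE::TypeProto* type_proto = arg.TypeAsProto();
  if (type_proto == nullptr || !type_proto->tensor_type().has_elem_type()) {
    arg_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
    return false;  // caller should skip the fusion rather than guess
  }

  arg_type = type_proto->tensor_type().elem_type();
  return true;
}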
Reply: So far, core runtime Clip fusion only supports float too (onnxruntime/onnxruntime/core/optimizer/utils.cc, lines 335 to 349 in c6ba7ed). Shall we update them together?

Reply: cc @snnn

Reply: I'd leave the core Clip fusion as-is for now. It can be a separate PR if we think there's a use-case that would benefit. Are you planning on updating ClipReluChecker to limit the types?

Reply: I checked https://onnx.ai/onnx/operators/onnx__Conv.html#type-constraints; an ONNX Conv node shouldn't have u8 or s8 inputs. @skottmckay

Reply: The XNNPack EP's Conv implementation also handles QLinearConv, doesn't it?

Reply: But QLinearConv isn't in the node_to_be_fuse list yet. Could we add it in the next PR?

Reply: Should be good in that case. To be safer, it would be good to add an else branch that returns an error, so that if we get a datatype other than fp32 or fp16 it isn't silently ignored. If we add QLinearConv to the nodes that can fuse (not sure why we don't allow that - maybe xnnpack doesn't support it), the else will make it much easier for a developer to discover they need to update this code.

Reply: already added
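
To make the suggestion concrete, here is a sketch of what such a conversion with a loud else could look like. This is illustrative only (ConvertClipParams and ClampLimits are hypothetical names, not the PR's actual code), but the key points match the discussion: the fp16 path converts the half-precision value instead of reinterpreting its bits, and any other element type returns an error rather than being silently ignored.

#include <limits>

#include "core/common/common.h"
#include "core/framework/float16.h"
#include "core/graph/onnx_protobuf.h"

namespace {

struct ClampLimits {
  float min = -std::numeric_limits<float>::infinity();
  float max = std::numeric_limits<float>::infinity();
};

// arg_type is the ONNX element type of the Conv input; raw_min/raw_max point
// at the bytes of the fused Clip's min/max initializers.
onnxruntime::common::Status ConvertClipParams(int32_t arg_type,
                                              const void* raw_min,
                                              const void* raw_max,
                                              ClampLimits& out) {
  if (arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
    out.min = *static_cast<const float*>(raw_min);
    out.max = *static_cast<const float*>(raw_max);
  } else if (arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
    // Convert the half-precision value properly; reinterpreting the 16-bit
    // pattern as a float is the kind of bug this PR fixes.
    out.min = static_cast<const onnxruntime::MLFloat16*>(raw_min)->ToFloat();
    out.max = static_cast<const onnxruntime::MLFloat16*>(raw_max)->ToFloat();
  } else {
    // The else suggested in review: fail loudly so that adding e.g.
    // QLinearConv to the fusable nodes forces this code to be revisited.
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Unsupported data type for fused activation params: ", arg_type);
  }
  return onnxruntime::common::Status::OK();
}

}  // namespace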