/* Quant.h -- forked from netease-youdao/EMLL */
/*****************************************************************************/
/* Copyright YouDao, Inc. */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
/* See the License for the specific language governing permissions and */
/* limitations under the License. */
/*****************************************************************************/
#include <stdint.h>
#include <arm_neon.h>
#ifndef INCLUDE_ARM_QUANT_INTERFACE
#define INCLUDE_ARM_QUANT_INTERFACE
#ifdef __cplusplus
extern "C" {
#endif
/***********************************************************************
Function:    bias_int32_t
Description: Perform a bias operation on a 32-bit signed int matrix.
             The matrix is accessed through dst, the only non-const
             pointer argument. This function can be used in
             asymmetric quantized GEMM.
Parameters:  dst: the address of the matrix to apply bias on
             bias_dim0: the bias value applied to every element
             bias_dim1: the address of the input bias vector which
                        will be applied to the matrix along its
                        major dimension, i.e. when the element
                        can be indexed by x * dim1 + y, each element
                        is biased by bias_dim1[y]. No bias will be
                        performed with a NULL pointer as input.
             bias_dim1_scale: the scale to be applied to elements
                              of bias_dim1[] prior to the bias
                              operation
             bias_dim2: the address of the input bias vector which
                        will be applied to the matrix along its
                        minor dimension, i.e. when the element
                        can be indexed by x * dim1 + y, each element
                        is biased by bias_dim2[x]. No bias will be
                        performed with a NULL pointer as input.
             bias_dim2_scale: the scale to be applied to elements
                              of bias_dim2[] prior to the bias
                              operation
             dim1: the length of the major dimension of input matrix
             dim2: the length of the minor dimension of input matrix
Note:        NOTE(review): from the indexing above, bias_dim1
             presumably holds dim1 elements and bias_dim2 holds dim2
             elements -- confirm against the implementation.
***********************************************************************/
void bias_int32_t(int32_t *dst, int32_t bias_dim0,
const int32_t *bias_dim1, int32_t bias_dim1_scale,
const int32_t *bias_dim2, int32_t bias_dim2_scale,
uint32_t dim1, uint32_t dim2);
/***********************************************************************
Function:    u8u32_sum
Description: Sum the columns or rows of an unsigned 8-bit int matrix.
             The sum of each column/row is an unsigned 32-bit
             integer.
Parameters:  src: the address of the input matrix.
             dst: the address of the output vector.
             dim1: the length of the major dimension of input matrix.
             dim2: the length of the minor dimension of input matrix.
                   (the major dimension is the vertical one for a
                   column-major matrix, or the horizontal one for a
                   row-major matrix)
             direction: the direction of summing
                        0: sum along the minor dimension,
                           output_vector_size == dim1;
                        1: sum along the major dimension,
                           output_vector_size == dim2.
***********************************************************************/
void u8u32_sum(const uint8_t *src, uint32_t *dst,
uint32_t dim1, uint32_t dim2, uint8_t direction);
/***********************************************************************
Function:    quantize_asymmetric_f32_u8
Description: Asymmetric quantization from fp32 to unsigned 8-bit int,
             producing an 8-bit zero-point integer Z0, an fp32 scale
             S0 and the quantized unsigned 8-bit data Q1-Qn on the
             fly.
             For each quantized element Qi, S0 * (Qi - Z0)
             approximates the original fp32 input Fi.
Parameters:  const float32_t *input: the address of the input fp32 array
             uint8_t *output: the address of the output integer array
             uint8_t *zero_point: the address to output Z0
             float32_t *scale: the address to output S0
             uint32_t size: the number of elements in the input
             float32_t input_min, input_max:
                 the min and max of the input float32_t numbers.
                 When input_min > input_max, the min and max of
                 the input are re-evaluated by the function.
***********************************************************************/
void quantize_asymmetric_f32_u8(const float32_t *input, uint8_t *output,
uint8_t *zero_point, float32_t *scale, uint32_t size,
float32_t input_min, float32_t input_max);
/***********************************************************************
Function:    quantize_symmetric_f32_s8
Description: Symmetric quantization from fp32 to signed 8-bit int,
             producing an fp32 scale S0 and the quantized 8-bit data
             Q1-Qn on the fly.
             For each quantized element Qi, S0 * Qi approximates the
             original fp32 input Fi.
Parameters:  const float32_t *input: the address of the input fp32 array
             int8_t *output: the address of the output integer array
             float32_t *scale: the address to output S0
             uint32_t size: the number of elements in the input
             float32_t input_min, input_max:
                 the min and max of the input float32_t numbers.
                 When input_min > input_max, the min and max of
                 the input are re-evaluated by the function.
***********************************************************************/
void quantize_symmetric_f32_s8(const float32_t *input, int8_t *output,
float32_t *scale, uint32_t size, float32_t input_min, float32_t input_max);
/***********************************************************************
Function:    quantize_asymmetric_f32_u16
Description: Asymmetric quantization from fp32 to unsigned 16-bit int,
             producing a 16-bit zero-point integer Z0, an fp32 scale
             S0 and the quantized unsigned 16-bit data Q1-Qn on the
             fly.
             This function does the same thing as
             quantize_asymmetric_f32_u8 except that the zero point
             and outputs are 16-bit integers; see that function for
             the meaning of each parameter.
***********************************************************************/
void quantize_asymmetric_f32_u16(const float32_t *input, uint16_t *output,
uint16_t *zero_point, float32_t *scale, uint32_t size,
float32_t input_min, float32_t input_max);
/***********************************************************************
Function:    quantize_symmetric_f32_s16
Description: Symmetric quantization from fp32 to signed 16-bit int,
             producing an fp32 scale S0 and the quantized 16-bit data
             Q1-Qn on the fly. This function does the same thing
             as quantize_symmetric_f32_s8 except that the outputs are
             16-bit integers; see that function for the meaning of
             each parameter.
***********************************************************************/
void quantize_symmetric_f32_s16(const float32_t *input, int16_t *output,
float32_t *scale, uint32_t size, float32_t input_min, float32_t input_max);
/***********************************************************************
Function:    dequantize_symmetric_f32_s32
Description: Convert 32-bit signed int values to fp32 values with
             scaling.
Parameters:  const int32_t *src: the address of the input integer array
             float32_t *dst: the address of the output fp32 array
             float32_t scale: the scaling factor applied to the input
             uint32_t size: the number of elements in the input
Note:        NOTE(review): presumably dst[i] = scale * src[i] for
             each i < size -- confirm against the implementation.
***********************************************************************/
void dequantize_symmetric_f32_s32(const int32_t *src, float32_t *dst,
float32_t scale, uint32_t size);
/************************************************************************
Function:    requantize_asymmetric_32to8
Description: Asymmetric requantization from signed 32-bit int to
             unsigned 8-bit int, which produces an 8-bit zero-point
             integer Z0, updates the fp32 scale S0 and outputs the
             requantized unsigned 8-bit data Q1-Qn on the fly.
             For each requantized element Qi, S0 * (Qi - Z0)
             approximates the original dequantized value (fp32) Fi
             of the corresponding 32-bit input.
Parameters:  const int32_t *input: the address of the input int array
             uint8_t *output: the address of the output integer array
             float *scale: the address of the scaling factor; it is
                           read as the scale of the 32-bit input and
                           updated to S0 (see the Note below)
             uint8_t *zero_point: the address to output Z0
             uint32_t size: the number of elements in the input
             int32_t input_min, input_max: the min and max value
                 of the input int32 numbers. If input_min > input_max,
                 the min and max of the input integers are
                 recalculated.
Note:        This function is near-equivalent to the sequence
                 dequant_cvt_float_int32_t(input, temporary_array,
                                           *scale, size);
                 quant_unsym_float_uint8_t(temporary_array, output,
                                           zero_point, scale, size);
             NOTE(review): neither name above is declared in this
             header; they presumably refer to
             dequantize_symmetric_f32_s32 and
             quantize_asymmetric_f32_u8 (which additionally takes
             input_min/input_max) -- confirm.
************************************************************************/
void requantize_asymmetric_32to8(const int32_t *input, uint8_t *output,
float *scale, uint8_t *zero_point, uint32_t size,
int32_t input_min, int32_t input_max);
/************************************************************************
Function:    requantize_symmetric_32to8
Description: Symmetric requantization from signed 32-bit int to
             signed 8-bit int, which updates the fp32 scale S0
             and outputs the requantized signed 8-bit data Q1-Qn
             on the fly.
             For each requantized element Qi, S0 * Qi approximates
             the original dequantized value (fp32) Fi of the
             corresponding 32-bit input.
Parameters:  const int32_t *input: the address of the input int array
             int8_t *output: the address of the output integer array
             float *scale: the address of the scaling factor; it is
                           read as the scale of the 32-bit input and
                           updated to S0 (see the Note below)
             uint32_t size: the number of elements in the input
             int32_t input_min, input_max: the min and max value
                 of the input int32 numbers. If input_min > input_max,
                 the min and max of the input integers are
                 recalculated.
Note:        This function is near-equivalent to the sequence
                 dequant_cvt_float_int32_t(input, temporary_array,
                                           *scale, size);
                 quant_sym_float_int8_t(temporary_array, output,
                                        scale, size);
             NOTE(review): neither name above is declared in this
             header; they presumably refer to
             dequantize_symmetric_f32_s32 and
             quantize_symmetric_f32_s8 (which additionally takes
             input_min/input_max) -- confirm.
************************************************************************/
void requantize_symmetric_32to8(const int32_t *input, int8_t *output,
float *scale, uint32_t size,
int32_t input_min, int32_t input_max);
/************************************************************************
 * Function:    requantize_asymmetric_32to16
 * Description: Asymmetric requantization from signed 32-bit int to
 *              unsigned 16-bit int. It does the same thing as
 *              requantize_asymmetric_32to8 except that the outputs
 *              and the zero point are 16-bit integers; see that
 *              function for the meaning of each parameter.
 ***********************************************************************/
void requantize_asymmetric_32to16(const int32_t *input, uint16_t *output,
float *scale, uint16_t *zero_point, uint32_t size,
int32_t input_min, int32_t input_max);
/************************************************************************
 * Function:    requantize_symmetric_32to16
 * Description: Symmetric requantization from signed 32-bit int to
 *              signed 16-bit int. It does the same thing as
 *              requantize_symmetric_32to8 except that the outputs
 *              are 16-bit integers; see that function for the
 *              meaning of each parameter.
 ***********************************************************************/
void requantize_symmetric_32to16(const int32_t *input, int16_t *output,
float *scale, uint32_t size,
int32_t input_min, int32_t input_max);
/************************************************************************
 * Function:    requantize_asymmetric_16to8
 * Description: Asymmetric requantization from signed 16-bit int to
 *              unsigned 8-bit int. It does the same thing as
 *              requantize_asymmetric_32to8 except that the inputs
 *              (and input_min/input_max) are 16-bit integers; see
 *              that function for the meaning of each parameter.
 ***********************************************************************/
void requantize_asymmetric_16to8(const int16_t *input, uint8_t *output,
float *scale, uint8_t *zero_point, uint32_t size,
int16_t input_min, int16_t input_max);
/************************************************************************
 * Function:    requantize_symmetric_16to8
 * Description: Symmetric requantization from signed 16-bit int to
 *              signed 8-bit int. It does the same thing as
 *              requantize_symmetric_32to8 except that the inputs
 *              (and input_min/input_max) are 16-bit integers; see
 *              that function for the meaning of each parameter.
 ***********************************************************************/
void requantize_symmetric_16to8(const int16_t *input, int8_t *output,
float *scale, uint32_t size,
int16_t input_min, int16_t input_max);
#ifdef __cplusplus
}
#endif
#endif