forked from algorithm-archivists/algorithm-archive
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfft.s
399 lines (389 loc) · 10.2 KB
/
fft.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
.intel_syntax noprefix
.section .rodata
two: .double 2.0
one: .double 1.0
two_pi: .double -6.28318530718
rand_max: .long 4290772992
.long 1105199103
fmt: .string "%g\n"
.section .text
.global main
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2
# rdi - array ptr
# rsi - array size
dft:
push rbx
push r12
push r13
push r14
push r15
mov r12, rdi # Save parameters
mov r13, rsi
sub rsp, r13 # Make a double complex array
xor r14, r14 # Set index to 0
dft_loop_i:
cmp r14, r13 # Check if index is equal to array size
je dft_end_i
lea rax, [rsp + r14] # Set tmp array to zero at r14
mov QWORD PTR [rax], 0
mov QWORD PTR [rax + 8], 0
xor r15, r15 # Set second index to 0
dft_loop_j:
cmp r15, r13 # Check if the index is equal to array size
je dft_end_j
movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N
mov rax, r14
imul rax, r15
shr rax, 4
cvtsi2sdq xmm2, rax
mulsd xmm1, xmm2
cvtsi2sdq xmm2, r13
divsd xmm1, xmm2
pxor xmm0, xmm0 # Set xmm0 to 0
call cexp
lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N)
movsd xmm2, QWORD PTR [rax]
movsd xmm3, QWORD PTR [rax + 8]
call __muldc3
lea rax, [rsp + r14]
movsd xmm6, QWORD PTR [rax] # Sum to tmp array
movsd xmm7, QWORD PTR [rax + 8]
addsd xmm6, xmm0
addsd xmm7, xmm1
movsd QWORD PTR [rax], xmm6 # Save to tmp array
movsd QWORD PTR [rax + 8], xmm7
add r15, 16
jmp dft_loop_j
dft_end_j:
add r14, 16
jmp dft_loop_i
dft_end_i:
mov rdi, r12 # Move tmp array to array ptr
mov rsi, rsp
mov rdx, r13
call memcpy
add rsp, r13
pop r15
pop r14
pop r13
pop r12
pop rbx
ret
# rdi - array ptr
# rsi - array size
cooley_tukey:
cmp rsi, 16 # Check if size if greater then 1
jle cooley_tukey_return
push rbx
push r12
push r13
push r14
push r15
mov r12, rdi # Save parameters
mov r13, rsi
mov r14, rsi # Save N / 2
shr r14, 1
sub rsp, r14 # Make a tmp array
xor r15, r15
mov rbx, r12
cooley_tukey_spliting:
cmp r15, r14
je cooley_tukey_split
lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array
movaps xmm0, XMMWORD PTR [rax + 16]
movaps xmm1, XMMWORD PTR [rax]
movaps XMMWORD PTR [rsp + r15], xmm0
movaps XMMWORD PTR [rbx], xmm1
add rbx, 16
add r15, 16
jmp cooley_tukey_spliting
cooley_tukey_split:
mov rax, rsp
lea rdi, [r12 + r13]
cooley_tukey_mov_data:
cmp rbx, rdi
je cooley_tukey_moved
movaps xmm0, XMMWORD PTR [rax]
movaps XMMWORD PTR [rbx], xmm0
add rbx, 16
add rax, 16
jmp cooley_tukey_mov_data
cooley_tukey_moved:
add rsp, r14
mov rdi, r12 # Makking a recursive call
mov rsi, r14
call cooley_tukey
lea rdi, [r12 + r14] # Makking a recursive call
mov rsi, r14
call cooley_tukey
lea rbx, [r12 + r14]
mov r14, rbx
mov r15, r12
cooley_tukey_loop:
cmp r15, rbx
je cooley_tukey_end
pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N)
movsd xmm1, two_pi
mov rax, r14
sub rax, rbx
cvtsi2sdq xmm2, rax
cvtsi2sdq xmm3, r13
divsd xmm2, xmm3
mulsd xmm1, xmm2
call cexp
movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2]
movq xmm3, QWORD PTR [r14 + 8]
call __muldc3
movq xmm2, QWORD PTR [r15]
movq xmm3, QWORD PTR [r15 + 8]
subsd xmm2, xmm0
subsd xmm3, xmm1
movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2]
movq QWORD PTR [r14 + 8], xmm3
movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i]
movq xmm1, QWORD PTR [r15 + 8]
subsd xmm2, xmm0
subsd xmm3, xmm1
subsd xmm0, xmm2
subsd xmm1, xmm3
movq QWORD PTR [r15], xmm0
movq QWORD PTR [r15 + 8], xmm1
add r14, 16
add r15, 16
jmp cooley_tukey_loop
cooley_tukey_end:
pop r15
pop r14
pop r13
pop r12
pop rbx
cooley_tukey_return:
ret
# rdi - array ptr
# rsi - array size
bit_reverse:
push rbx
push r12
push r13
push r14
push r15
mov r12, rdi # Save parameters
mov r13, rsi
shr r13, 4
xor r14, r14 # Loop through all entries
bit_reverse_entries:
cmp r14, r13
je bit_reverse_return
cvtsi2sdq xmm0, r13 # Calculating the number of bit in N
call log2
cvttsd2si rcx, xmm0
mov rdi, 1 # Calculating (1 << log2(N)) - 1
sal edi, cl
sub edi, 1
sub ecx, 1
mov rax, r14
mov r15, r14
bit_reverse_loop:
sar r15 # Check if r15 is 0
je bit_reverse_reversed
sal rax, 1 # Calculating (rax << 1) | (r15 & 1)
mov rsi, r15
and rsi, 1
or rax, rsi
sub ecx, 1 # Decrement bit count
jmp bit_reverse_loop
bit_reverse_reversed:
sal eax, cl # Calculate (rax << rcx) & (1 << bit count)
and rax, rdi
cmp rax, r14 # Check if rax is greater then r14
jle bit_reverse_no_swap # If so then swap entries
shl rax, 4 # Times index by 16 to get bytes to entry
shl r14, 4
movaps xmm0, XMMWORD PTR [r12 + rax]
movaps xmm1, XMMWORD PTR [r12 + r14]
movaps XMMWORD PTR [r12 + rax], xmm1
movaps XMMWORD PTR [r12 + r14], xmm0
shr r14, 4
bit_reverse_no_swap:
add r14, 1
jmp bit_reverse_entries
bit_reverse_return:
pop r15
pop r14
pop r13
pop r12
pop rbx
ret
# rdi - array ptr
# rsi - array size
iterative_cooley_tukey:
push r12
push r13
push r14
push r15
push rbx
sub rsp, 48
mov r12, rdi
mov r13, rsi
call bit_reverse # Bit reversing array
sar r13, 4 # Calculate log2(N)
cvtsi2sdq xmm0, r13
call log2
cvttsd2si rax, xmm0
mov QWORD PTR [rsp], rax # Save it to the stack
mov r14, 1
iter_ct_loop_i:
cmp r14, rax # Check if r14 is greater then log2(N)
jg iter_ct_end_i
movsd xmm0, two # Calculate stride = 2^(r14)
cvtsi2sdq xmm1, r14
call pow
cvttsd2si r10, xmm0
mov QWORD PTR [rsp + 40], r10# move stride to stack
movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride)
divsd xmm1, xmm0
pxor xmm0, xmm0
call cexp
movq QWORD PTR [rsp + 8], xmm0 # Save it to stack
movq QWORD PTR [rsp + 16], xmm1
xor r15, r15
iter_ct_loop_j:
cmp r15, r13 # Check if r15 is less then array size
je iter_ct_end_j
movsd xmm4, one # Save 1 + 0i to stack
pxor xmm5, xmm5
movsd QWORD PTR [rsp + 24], xmm4
movsd QWORD PTR [rsp + 32], xmm5
xor rbx, rbx
mov rax, QWORD PTR [rsp + 40]# Calculate stride / 2
sar rax, 1
iter_ct_loop_k:
cmp rbx, rax # Check if rbx is less then stride / 2
je iter_ct_end_k
mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j]
add r8, rbx
sal r8, 4
mov r9, QWORD PTR [rsp + 40]
sal r9, 3
add r9, r8
lea r9, [r12 + r9]
lea r8, [r12 + r8]
movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2]
movsd xmm1, QWORD PTR [r9 + 8]
movsd xmm2, QWORD PTR [rsp + 24]
movsd xmm3, QWORD PTR [rsp + 32]
call __muldc3
movsd xmm2, QWORD PTR [r8]
movsd xmm3, QWORD PTR [r8 + 8]
subsd xmm2, xmm0
subsd xmm3, xmm1
movsd QWORD PTR [r9], xmm2 # Saving answer
movsd QWORD PTR [r9 + 8], xmm3
movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
movsd xmm1, QWORD PTR [r8 + 8]
subsd xmm2, xmm0
subsd xmm3, xmm1
subsd xmm0, xmm2
subsd xmm1, xmm3
movsd QWORD PTR [r8], xmm0 # Saving answer
movsd QWORD PTR [r8 + 8], xmm1
movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w
movsd xmm1, QWORD PTR [rsp + 32]
movsd xmm2, QWORD PTR [rsp + 8]
movsd xmm3, QWORD PTR [rsp + 16]
call __muldc3
movsd QWORD PTR [rsp + 24], xmm0 # Saving answer
movsd QWORD PTR [rsp + 32], xmm1
add rbx, 1
mov rax, QWORD PTR [rsp + 40]
sar rax, 1
jmp iter_ct_loop_k
iter_ct_end_k:
add r15, QWORD PTR [rsp + 40]
jmp iter_ct_loop_j
iter_ct_end_j:
add r14, 1
mov rax, QWORD PTR [rsp]
jmp iter_ct_loop_i
iter_ct_end_i:
add rsp, 48
pop rbx
pop r15
pop r14
pop r13
pop r12
ret
# rdi - array a ptr
# rsi - array b ptr
# rdx - array size
approx:
push r12
push r13
push r14
push r15
mov r12, rdi
mov r13, rsi
mov r14, rdx
lea r15, [rdi + rdx]
sub rsp, 8
approx_loop:
cmp r12, r15
je approx_return
movsd xmm0, QWORD PTR[r13]
movsd xmm1, QWORD PTR[r13 + 8]
call cabs
movsd QWORD PTR [rsp], xmm0
movsd xmm0, QWORD PTR[r12]
movsd xmm1, QWORD PTR[r12 + 8]
call cabs
movsd xmm1, QWORD PTR [rsp]
subsd xmm0, xmm1
mov rdi, OFFSET fmt
mov rax, 1
call printf
add r12, 16
add r13, 16
jmp approx_loop
approx_return:
add rsp, 8
pop r15
pop r14
pop r13
pop r12
ret
main:
push r12
sub rsp, 2048
mov rdi, 0
call time
mov edi, eax
call srand
lea r12, [rsp + 1024]
loop:
cmp r12, rsp
je end_loop
sub r12, 16
call rand
cvtsi2sd xmm0, rax
divsd xmm0, rand_max
lea rax, [r12 + 1024]
movsd QWORD PTR [r12], xmm0
movsd QWORD PTR [rax], xmm0
mov QWORD PTR [r12 + 8], 0
mov QWORD PTR [rax + 8], 0
jmp loop
end_loop:
mov rdi, rsp
mov rsi, 1024
call iterative_cooley_tukey
lea rdi, [rsp + 1024]
mov rsi, 1024
call cooley_tukey
mov rdi, rsp
lea rsi, [rsp + 1024]
mov rdx, 1024
call approx
xor rax, rax
add rsp, 2048
pop r12
ret