1 /*
2 // 3dNow-optimized FFT written by Jean-Marc Valin.
3 // Inverse FFT derived by Mike DeSimone.
4 // Bit reverse algorithm by Mike DeSimone.
5 // Copyright 2001, 2002 Jean-Marc Valin and Mike DeSimone
6 //
7 // This software is released under version 2.1 of the GNU Lesser General
8 // Public License.
9 //
10 // I declared all complex arrays as float*. Internal format is:
11 // struct { float re; float im; }
12 // Use whatever similar structure fits your needs. I'm a C++ programmer,
13 // so I use complex<float>.
14 //
15 // Initialization:
16 // int* scrTable = CreateScrambleTable(bits);
17 // float* wTable = CreateCoefficientTable(bits);
18 // Forward FFT:
19 // Scramble(input, output, bits, scrTable);
20 // FFTRecursive(b, bits, wTable, 1, 0);
21 // Inverse FFT:
22 // Scramble(input, output, bits, scrTable);
23 // FFTRecursive(b, bits, wTable, 1, 1);
24 // Cleanup:
25 // free(scrTable);
26 // free(wTable);
27 //
28 // Note y[0..N-1] = IFFT(FFT(x[0..N-1])) = N * x[0..N-1]
29 */
30
31 #include <math.h>
32 #define PI (4.0 * atan(1.0))
33
34 unsigned* CreateScrambleTable(unsigned bits)
35 { unsigned *table, i, points;
36 points = 1 << bits;
37 table = (int*)malloc(sizeof(int) * points);
38 if(!table)
39 return 0;
40 for(i = 0; i < points; i++)
41 { register unsigned tmp = i;
42 tmp = (tmp & 0xAAAAAAAA) >> 1 | (tmp & 0x55555555) << 1;
43 tmp = (tmp & 0xCCCCCCCC) >> 2 | (tmp & 0x33333333) << 2;
44 tmp = (tmp & 0xF0F0F0F0) >> 4 | (tmp & 0x0F0F0F0F) << 4;
45 tmp = (tmp & 0xFF00FF00) >> 8 | (tmp & 0x00FF00FF) << 8;
46 tmp = (tmp & 0xFFFF0000) >> 16 | (tmp & 0x0000FFFF) << 16;
47 tmp >>= 32 - bits;
48 table[i] = tmp;
49 }
50 return table;
51 }
52
53 float* CreateCoefficientTable(unsigned bits)
54 { float *table, *w;
55 unsigned points, size;
56 points = 1 << bits;
57 table = (float*)malloc(sizeof(float) * 2 * points);
58 if(!table)
59 return 0;
60 w = table;
61 for(size = points; size > 0; size >>= 1)
62 { int k;
63 float p = 2.0 * PI / size;
64 for(k = 0; k < size >> 1; k++)
65 { *w++ = cos(k * p);
66 *w++ = -sin(k * p);
67 }
68 }
69 return table;
70 }
71
72 void Scramble(const float *in, float *out, unsigned bits, unsigned *scrTable)
73 { int dummy;
74 __asm__ __volatile__ ("
75 push %0
76 push %2
77 push %3
78 .loop%=:
79 push %0
80 movl (%3), %4
81 add $4, %3
82 movl (%3), %0
83 add $4, %3
84 movq (%1,%4,8), %%mm0
85 movq (%1,%0,8), %%mm1
86 movl (%3), %4
87 add $4, %3
88 movl (%3), %0
89 add $4, %3
90 movq (%1,%4,8), %%mm2
91 movq (%1,%0,8), %%mm3
92 movl (%3), %4
93 add $4, %3
94 movl (%3), %0
95 add $4, %3
96 movq (%1,%4,8), %%mm4
97 movq (%1,%0,8), %%mm5
98 movl (%3), %4
99 add $4, %3
100 movl (%3), %0
101 add $4, %3
102 movq (%1,%4,8), %%mm6
103 movq (%1,%0,8), %%mm7
104 movq %%mm0, (%2)
105 add $8, %2
106 movq %%mm1, (%2)
107 add $8, %2
108 movq %%mm2, (%2)
109 add $8, %2
110 movq %%mm3, (%2)
111 add $8, %2
112 movq %%mm4, (%2)
113 add $8, %2
114 movq %%mm5, (%2)
115 add $8, %2
116 movq %%mm6, (%2)
117 add $8, %2
118 movq %%mm7, (%2)
119 add $8, %2
120 pop %0
121 dec %0
122 jne .loop%=
123 femms
124 pop %3
125 pop %2
126 pop %0
127 ": : "q" (1 << (bits - 3)), "r" (in), "r" (out), "r" (scrTable), "q" (dummy)
128 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
129 );
130
131 }
132
133 void FFTRecursive(float *data, unsigned fftBits, const float *wsave, unsigned repeat, int reverse)
134 { unsigned fftPoints, fftHalfPoints;
135 int mask[2] = {0x80000000, 0x00000000};
136 if(fftBits == 1)
137 return;
138 fftPoints = 1 << fftBits;
139 fftHalfPoints = fftPoints >> 1;
140 if(fftBits > 11)
141 { FFTRecursive(data, fftBits - 1, wsave + fftHalfPoints, repeat, reverse);
142 FFTRecursive(data + fftHalfPoints, fftBits - 1, wsave + fftHalfPoints, repeat, reverse);
143 }
144 else
145 FFTRecursive(data, fftBits - 1, wsave + fftHalfPoints, repeat << 1, reverse);
146 if(reverse)
147 if(fftBits > 2)
148 { int dummy1, dummy2, dummy3, dummy4;
149 while(repeat--)
150 { __asm__ __volatile__ ("
151 .loop%=:
152 movq (%0), %%mm0
153 movq 8(%0), %%mm4
154 pswapd %%mm0, %%mm2
155 pswapd %%mm4, %%mm6
156 movq (%1), %%mm1
157 movq 8(%1), %%mm5
158 pfmul %%mm1, %%mm0
159 pfmul %%mm1, %%mm2
160 pfmul %%mm5, %%mm4
161 pfmul %%mm5, %%mm6
162 pfpnacc %%mm0, %%mm2
163 pfpnacc %%mm4, %%mm6
164 pswapd %%mm2, %%mm0
165 pswapd %%mm6, %%mm4
166 movq (%2), %%mm3
167 movq 8(%2), %%mm7
168 movq %%mm3, %%mm1
169 movq %%mm7, %%mm5
170 pfsub %%mm0, %%mm3
171 pfadd %%mm0, %%mm1
172 pfsub %%mm4, %%mm7
173 pfadd %%mm4, %%mm5
174 movq %%mm1, (%2)
175 movq %%mm5, 8(%2)
176 add $16, %1
177 add $16, %2
178 movq %%mm3, (%0)
179 movq %%mm7, 8(%0)
180 add $16, %0
181 dec %3
182 jne .loop%=
183 ": "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4)
184 : "0" (data + fftHalfPoints), "1" (wsave), "2" (data), "3" (fftHalfPoints >> 1)
185 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
186 );
187 data += fftPoints;
188 }
189 }
190 else
191 { __asm__ __volatile__ ("
192 push %0
193 push %1
194 movq (%2), %%mm7
195 .loop%=:
196 movq (%0), %%mm0
197 movq 8(%0), %%mm1
198 movq %%mm0, %%mm4
199 pfadd %%mm1, %%mm0
200 movq 16(%0), %%mm2
201 movq 24(%0), %%mm3
202 movq %%mm2, %%mm5
203 pfadd %%mm3, %%mm2
204 pfsub %%mm1, %%mm4
205 pfsub %%mm3, %%mm5
206 movq %%mm0, %%mm1
207 pfsub %%mm2, %%mm0
208 pfadd %%mm2, %%mm1
209 pswapd %%mm5, %%mm5
210 movq %%mm0, 16(%0)
211 movq %%mm1, (%0)
212 pxor %%mm7, %%mm5
213 movq %%mm4, %%mm0
214 pfadd %%mm5, %%mm0
215 pfsub %%mm5, %%mm4
216 movq %%mm0, 8(%0)
217 movq %%mm4, 24(%0)
218 add $32, %0
219 dec %1
220 jne .loop%=
221 pop %1
222 pop %0
223 ": : "r" (data), "q" (repeat), "r" (mask)
224 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
225 );
226 }
227 else
228 if(fftBits > 2)
229 { int dummy1, dummy2, dummy3, dummy4;
230 while(repeat--)
231 { __asm__ __volatile__ ("
232 .loop%=:
233 movq (%0), %%mm0
234 movq 8(%0), %%mm4
235 pswapd %%mm0, %%mm2
236 pswapd %%mm4, %%mm6
237 movq (%1), %%mm1
238 movq 8(%1), %%mm5
239 pfmul %%mm1, %%mm0
240 pfmul %%mm1, %%mm2
241 pfmul %%mm5, %%mm4
242 pfmul %%mm5, %%mm6
243 pfpnacc %%mm2, %%mm0
244 pfpnacc %%mm6, %%mm4
245 movq (%2), %%mm3
246 movq 8(%2), %%mm7
247 movq %%mm3, %%mm1
248 movq %%mm7, %%mm5
249 pfsub %%mm0, %%mm3
250 pfadd %%mm0, %%mm1
251 pfsub %%mm4, %%mm7
252 pfadd %%mm4, %%mm5
253 movq %%mm1, (%2)
254 movq %%mm5, 8(%2)
255 add $16, %1
256 add $16, %2
257 movq %%mm3, (%0)
258 movq %%mm7, 8(%0)
259 add $16, %0
260 dec %3
261 jne .loop%=
262 ": "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4)
263 : "0" (data + fftHalfPoints), "1" (wsave), "2" (data), "3" (fftHalfPoints >> 1)
264 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
265 );
266
267 data += fftPoints;
268 }
269 }
270 else
271 { __asm__ __volatile__ ("
272 push %0
273 push %1
274 movq (%2), %%mm7
275 .loop%=:
276 movq (%0), %%mm0
277 movq 8(%0), %%mm1
278 movq %%mm0, %%mm4
279 pfadd %%mm1, %%mm0
280 movq 16(%0), %%mm2
281 movq 24(%0), %%mm3
282 movq %%mm2, %%mm5
283 pfadd %%mm3, %%mm2
284 pfsub %%mm1, %%mm4
285 pfsub %%mm3, %%mm5
286 movq %%mm0, %%mm1
287 pfsub %%mm2, %%mm0
288 pfadd %%mm2, %%mm1
289 pswapd %%mm5, %%mm5
290 movq %%mm0, 16(%0)
291 movq %%mm1, (%0)
292 pxor %%mm7, %%mm5
293 movq %%mm4, %%mm0
294 pfadd %%mm5, %%mm0
295 pfsub %%mm5, %%mm4
296 movq %%mm0, 24(%0)
297 movq %%mm4, 8(%0)
298 add $32, %0
299 dec %1
300 jne .loop%=
301 pop %1
302 pop %0
303 ": : "r" (data), "q" (repeat), "r" (mask)
304 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
305 );
306 }
307 __asm__ __volatile__ ("femms": :
308 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
309 );
310 }
311