1 /* Copyright Jean-Marc Valin 2001 */
2 /* This software is released under the version 2.1 of the
3 GNU Lesser General Public License (LGPL)*/
4
5 /*Example: for an (out-of-place) 256 transform:
6 COMPLEX in[256];
7 COMPLEX out[256];
8 COMPLEX w[256];
9 int bits[256];
10 fft_initCosSinTables(w, bits, 8);
11 void fft(in, out, 8, w, bits);
12
13 Note: fft_initCosSinTables only need to be called once for each size
14 8 is the order of the FFT (2^8=256)
15 */
16
17 #include <math.h>
18
19 typedef struct COMPLEX {
20 float re;
21 float im;
22 } COMPLEX;
23
24 void fft_initCosSinTables(COMPLEX *w, int *bits, int M)
25 {
26 int i,j;
27 int tmp;
28 int size = 1 << M;
29 for (i=0;i<size;i++)
30 {
31 bits[i]=0;
32 tmp=i;
33 for (j=0;j<M;j++)
34 {
35 bits[i] <<= 1;
36 bits[i] += tmp&1;
37 tmp>>=1;
38 }
39 }
40
41 while (size)
42 {
43 int k;
44 float tmp, p = (2.0 * M_PI) / size;
45 COMPLEX tmp2;
46 for (k = 0; k < (size>>1); k++) {
47 tmp = k * p;
48 tmp2.re=cos(tmp);
49 tmp2.im=-sin(tmp);
50 *w++ = tmp2;
51 }
52 size >>= 1;
53 }
54 }
55 void bitrev(COMPLEX *in, COMPLEX *out, int *bits, int M)
56 {
57 int dummy;
58 int size = 1 << M;
59
60 __asm__ __volatile__ (
61 "
62 push %0
63 push %2
64 push %3
65 .loop%=:
66 movl (%3), %4
67 movq (%1,%4,8), %%mm0
68 movl 4(%3), %4
69 movq (%1,%4,8), %%mm1
70 movq %%mm0, (%2)
71 movq %%mm1, 8(%2)
72 add $8, %3
73 add $16, %2
74 dec %0
75 jne .loop%=
76 femms
77 pop %3
78 pop %2
79 pop %0
80 " : : "q" (size>>1), "r" (in), "r" (out), "r" (bits), "q" (dummy)
81 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
82
83 }
84
85 void recurs_fft(COMPLEX *x, int M, COMPLEX *w, int repeat)
86 {
87 int dummy1, dummy2, dummy3, dummy4;
88 int N = 1 << M;
89 int N2 = N >> 1;
90 int mask = N-1;
91 if (M==0)
92 return;
93 recurs_fft(x, M-1, w+N2, repeat<<1);
94
95 if (M>2)
96 {
97 while (repeat--)
98 {
99 __asm__ __volatile__ (
100 "
101 .loop%=:
102 movq (%0), %%mm0
103 pswapd %%mm0, %%mm2
104 movq 8(%0), %%mm4
105 pswapd %%mm4, %%mm6
106 movq (%1), %%mm1
107 movq 8(%1), %%mm5
108 pfmul %%mm1, %%mm0
109 pfmul %%mm1, %%mm2
110 movq (%2), %%mm3
111 movq 8(%2), %%mm7
112 pfmul %%mm5, %%mm4
113 pfmul %%mm5, %%mm6
114 movq %%mm3, %%mm1
115 movq %%mm7, %%mm5
116 pfpnacc %%mm2, %%mm0
117 pfpnacc %%mm6, %%mm4
118 pfsub %%mm0, %%mm3
119 pfadd %%mm0, %%mm1
120 pfsub %%mm4, %%mm7
121 pfadd %%mm4, %%mm5
122 movq %%mm1, (%2)
123 movq %%mm3, (%0)
124 movq %%mm5, 8(%2)
125 movq %%mm7, 8(%0)
126 add $16, %0
127 add $16, %1
128 add $16, %2
129 dec %3
130 jne .loop%=
131 "
132 : "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) : "0" (x+N2), "1" (w), "2" (x), "3" (N2>>1)
133 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
134
135 x+=N;
136 }
137 } else if (M==2)
138 {
139 __asm__ __volatile__ (
140 "
141 push %0
142 push %2
143 push %3
144 .loop%=:
145 movq (%0), %%mm0
146 movq 8(%0), %%mm4
147 movq (%1), %%mm1
148 movq 8(%1), %%mm5
149 pswapd %%mm0, %%mm2
150 pswapd %%mm4, %%mm6
151 movq (%2), %%mm3
152 movq 8(%2), %%mm7
153 pfmul %%mm1, %%mm0
154 pfmul %%mm1, %%mm2
155 pfmul %%mm5, %%mm4
156 pfmul %%mm5, %%mm6
157 pfpnacc %%mm2, %%mm0
158 pfpnacc %%mm6, %%mm4
159 movq %%mm3, %%mm1
160 movq %%mm7, %%mm5
161 pfsub %%mm0, %%mm3
162 pfadd %%mm0, %%mm1
163 pfsub %%mm4, %%mm7
164 pfadd %%mm4, %%mm5
165 movq %%mm1, (%2)
166 movq %%mm3, (%0)
167 movq %%mm5, 8(%2)
168 movq %%mm7, 8(%0)
169 add $32, %0
170 add $32, %2
171 dec %3
172 jne .loop%=
173 pop %3
174 pop %2
175 pop %0
176 "
177 : : "r" (x+2), "r" (w), "r" (x), "q" (repeat)
178 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
179
180 } else {
181 __asm__ __volatile__ (
182 "
183 push %0
184 push %1
185 .loop%=:
186 movq (%0), %%mm0
187 movq 8(%0), %%mm1
188 movq 16(%0), %%mm2
189 movq 24(%0), %%mm3
190
191 movq %%mm0, %%mm4
192 pfadd %%mm1, %%mm0
193
194 movq %%mm2, %%mm5
195 pfadd %%mm3, %%mm2
196
197 pfsub %%mm1, %%mm4
198 pfsub %%mm3, %%mm5
199
200 movq %%mm0, (%0)
201 movq %%mm4, 8(%0)
202 movq %%mm2, 16(%0)
203 movq %%mm5, 24(%0)
204 add $32, %0
205
206 dec %1
207 jne .loop%=
208 pop %1
209 pop %0
210 "
211 : : "r" (x), "q" (repeat>>1)
212 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
213
214 }
215 __asm__ __volatile__ ("femms"
216 : :
217 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
218
219 }
220
221
222 void fft(COMPLEX *in, COMPLEX *out, int M, COMPLEX *w, int *bits)
223 {
224 bitrev(in, out, bits, M);
225 recurs_fft(out, M, w, 1);
226 }
227