Du bist hier: Snippet-Verzeichnis » C/C++ (495)
Sprache:

3D Now! FFT implementation

Sprache: English
Programmiersprache: C
Veröffentlicht von: jmvalin [nicht registriert]
Letzte Änderung: 15.05.2006
Aufrufe: 955


Beschreibung

A (very) Fast Fourier Transform (FFT) implementation using 3DNow! assembly. On my Athlon 1.33, I get 1.6 gflops performance. Note: AMD Athlon only (not K6-II)

Code

1 /* Copyright Jean-Marc Valin 2001 */ 2 /* This software is released under the version 2.1 of the GNU Lesser General*/ 3 4 5 /*Example: for an (out-of-place) 256 transform: 6 COMPLEX in[256]; 7 COMPLEX out[256]; 8 COMPLEX w[256]; 9 int bits[256]; 10 fft_initCosSinTables(w, bits, 8); 11 void fft(in, out, 8, w, bits); 12 13 Note: fft_initCosSinTables only need to be called once for each size 14 8 is the order of the FFT (2^8=256) 15 */ 16 17 #include <math.h> 18 19 20 typedef struct COMPLEX { 21 float re; 22 float im; 23 } COMPLEX; 24 25 void fft_initCosSinTables(COMPLEX *w, int *bits, int M) 26 { 27 int i,j; 28 int tmp; 29 int size = 1 << M; 30 for (i=0;i<size;i++) 31 { 32 bits[i]=0; 33 tmp=i; 34 for (j=0;j<M;j++) 35 { 36 bits[i] <<= 1; 37 bits[i] += tmp&1; 38 tmp>>=1; 39 } 40 } 41 42 while (size) 43 { 44 int k; 45 float tmp, p = (2.0 * M_PI) / size; 46 COMPLEX tmp2; 47 for (k = 0; k < (size>>1); k++) { 48 tmp = k * p; 49 tmp2.re=cos(tmp); 50 tmp2.im=-sin(tmp); 51 *w++ = tmp2; 52 } 53 size >>= 1; 54 } 55 } 56 inline void bitrev(COMPLEX *in, COMPLEX *out, int *bits, int M) 57 { 58 int dummy; 59 int size = 1 << M; 60 61 __asm__ __volatile__ ( 62 " 63 push %0 64 push %2 65 push %3 66 .loop%=: 67 push %0 68 69 70 movl (%3), %4 71 add $4, %3 72 movl (%3), %0 73 add $4, %3 74 movq (%1,%4,8), %%mm0 75 movq (%1,%0,8), %%mm1 76 77 movl (%3), %4 78 add $4, %3 79 movl (%3), %0 80 add $4, %3 81 movq (%1,%4,8), %%mm2 82 movq (%1,%0,8), %%mm3 83 84 85 movl (%3), %4 86 add $4, %3 87 movl (%3), %0 88 add $4, %3 89 movq (%1,%4,8), %%mm4 90 movq (%1,%0,8), %%mm5 91 92 movl (%3), %4 93 add $4, %3 94 movl (%3), %0 95 add $4, %3 96 movq (%1,%4,8), %%mm6 97 movq (%1,%0,8), %%mm7 98 99 movq %%mm0, (%2) 100 add $8, %2 101 movq %%mm1, (%2) 102 add $8, %2 103 movq %%mm2, (%2) 104 add $8, %2 105 movq %%mm3, (%2) 106 add $8, %2 107 108 109 movq %%mm4, (%2) 110 add $8, %2 111 movq %%mm5, (%2) 112 add $8, %2 113 movq %%mm6, (%2) 114 add $8, %2 115 movq %%mm7, (%2) 116 add $8, %2 117 118 119 pop %0 120 121 dec %0 122 jne .loop%= 123 femms 124 pop %3 125 pop %2 126 pop %0 127 " : : "q" (size>>3), "r" (in), "r" (out), "r" (bits), "q" (dummy) 128 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 129 130 } 131 132 inline void recurs_fft(COMPLEX *x, int M, COMPLEX *w, int repeat) 133 { 134 int dummy1, dummy2, dummy3, dummy4; 135 int N = 1 << M; 136 int N2 = N >> 1; 137 int mask = N-1; 138 if (M==1) 139 return; 140 if (M>11) 141 { 142 recurs_fft(x, M-1, w+N2, repeat); 143 recurs_fft(x+N2, M-1, w+N2, repeat); 144 } else { 145 recurs_fft(x, M-1, w+N2, repeat<<1); 146 } 147 if (M>2) 148 { 149 while (repeat--) 150 { 151 __asm__ __volatile__ ( 152 " 153 .loop%=: 154 movq (%0), %%mm0 155 movq 8(%0), %%mm4 156 pswapd %%mm0, %%mm2 157 pswapd %%mm4, %%mm6 158 movq (%1), %%mm1 159 movq 8(%1), %%mm5 160 pfmul %%mm1, %%mm0 161 pfmul %%mm1, %%mm2 162 pfmul %%mm5, %%mm4 163 pfmul %%mm5, %%mm6 164 pfpnacc %%mm2, %%mm0 165 pfpnacc %%mm6, %%mm4 166 movq (%2), %%mm3 167 movq 8(%2), %%mm7 168 movq %%mm3, %%mm1 169 movq %%mm7, %%mm5 170 pfsub %%mm0, %%mm3 171 pfadd %%mm0, %%mm1 172 pfsub %%mm4, %%mm7 173 pfadd %%mm4, %%mm5 174 movq %%mm1, (%2) 175 movq %%mm5, 8(%2) 176 add $16, %1 177 add $16, %2 178 movq %%mm3, (%0) 179 movq %%mm7, 8(%0) 180 add $16, %0 181 dec %3 182 jne .loop%= 183 " 184 : "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) : "0" (x+N2), "1" (w), "2" (x), "3" (N2>>1) 185 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 186 187 x+=N; 188 } 189 } else { 190 int mask[2]={0x80000000, 0x00000000}; 191 __asm__ __volatile__ ( 192 " 193 push %0 194 push %1 195 movq (%2), %%mm7 196 .loop%=: 197 movq (%0), %%mm0 198 movq 8(%0), %%mm1 199 200 movq %%mm0, %%mm4 201 pfadd %%mm1, %%mm0 202 203 movq 16(%0), %%mm2 204 movq 24(%0), %%mm3 205 206 movq %%mm2, %%mm5 207 pfadd %%mm3, %%mm2 208 209 pfsub %%mm1, %%mm4 210 pfsub %%mm3, %%mm5 211 212 movq %%mm0, %%mm1 213 pfsub %%mm2, %%mm0 214 pfadd %%mm2, %%mm1 215 pswapd %%mm5, %%mm5 216 movq %%mm0, 16(%0) 217 movq %%mm1, (%0) 218 219 pxor %%mm7, %%mm5 220 movq %%mm4, %%mm0 221 pfadd %%mm5, %%mm0 222 pfsub %%mm5, %%mm4 223 224 movq %%mm0, 24(%0) 225 movq %%mm4, 8(%0) 226 227 add $32, %0 228 229 dec %1 230 jne .loop%= 231 pop %1 232 pop %0 233 " 234 : : "r" (x), "q" (repeat), "r" (mask) 235 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 236 237 } 238 __asm__ __volatile__ ("femms" 239 : : 240 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 241 242 } 243 244 245 void fft(COMPLEX *in, COMPLEX *out, int M, COMPLEX *w, int *bits) 246 { 247 bitrev(in, out, bits, M); 248 recurs_fft(out, M, w, 1); 249 } 250

Noch kein Kommentar vorhanden

Dieses Snippet kommentieren

Name *  

E-Mail (wird nicht angezeigt) *    

Website  

Kommentar *  

Sicherheitscode Sicherheitscode *    

RSS