Du bist hier: Snippet-Verzeichnis » C/C++ (495)
Sprache:

3D Now! FFT implementation

Sprache: English
Programmiersprache: C
Veröffentlicht von: jmvalin [nicht registriert]
Letzte Änderung: 15.05.2006
Aufrufe: 1015


Beschreibung

A (very) Fast Fourier Transform (FFT) implementation using 3DNow! assembly. On my Athlon 1.33, I get 1.6 gflops performance. Note: AMD Athlon only (not K6-II)

Code

1 /* 2 // 3dNow-optimized FFT written by Jean-Marc Valin. 3 // Inverse FFT derived by Mike DeSimone. 4 // Bit reverse algorithm by Mike DeSimone. 5 // Copyright 2001, 2002 Jean-Marc Valin and Mike DeSimone 6 // 7 // This software is released under version 2.1 of the GNU Lesser General 8 // Public License. 9 // 10 // I declared all complex arrays as float*. Internal format is: 11 // struct { float re; float im; } 12 // Use whatever similar structure fits your needs. I'm a C++ programmer, 13 // so I use complex<float>. 14 // 15 // Initialization: 16 // int* scrTable = CreateScrambleTable(bits); 17 // float* wTable = CreateCoefficientTable(bits); 18 // Forward FFT: 19 // Scramble(input, output, bits, scrTable); 20 // FFTRecursive(b, bits, wTable, 1, 0); 21 // Inverse FFT: 22 // Scramble(input, output, bits, scrTable); 23 // FFTRecursive(b, bits, wTable, 1, 1); 24 // Cleanup: 25 // free(scrTable); 26 // free(wTable); 27 // 28 // Note y[0..N-1] = IFFT(FFT(x[0..N-1])) = N * x[0..N-1] 29 */ 30 31 #include <math.h> 32 #define PI (4.0 * atan(1.0)) 33 34 unsigned* CreateScrambleTable(unsigned bits) 35 { unsigned *table, i, points; 36 points = 1 << bits; 37 table = (int*)malloc(sizeof(int) * points); 38 if(!table) 39 return 0; 40 for(i = 0; i < points; i++) 41 { register unsigned tmp = i; 42 tmp = (tmp & 0xAAAAAAAA) >> 1 | (tmp & 0x55555555) << 1; 43 tmp = (tmp & 0xCCCCCCCC) >> 2 | (tmp & 0x33333333) << 2; 44 tmp = (tmp & 0xF0F0F0F0) >> 4 | (tmp & 0x0F0F0F0F) << 4; 45 tmp = (tmp & 0xFF00FF00) >> 8 | (tmp & 0x00FF00FF) << 8; 46 tmp = (tmp & 0xFFFF0000) >> 16 | (tmp & 0x0000FFFF) << 16; 47 tmp >>= 32 - bits; 48 table[i] = tmp; 49 } 50 return table; 51 } 52 53 float* CreateCoefficientTable(unsigned bits) 54 { float *table, *w; 55 unsigned points, size; 56 points = 1 << bits; 57 table = (float*)malloc(sizeof(float) * 2 * points); 58 if(!table) 59 return 0; 60 w = table; 61 for(size = points; size > 0; size >>= 1) 62 { int k; 63 float p = 2.0 * PI / size; 64 for(k = 0; k < size >> 1; k++) 65 { *w++ = cos(k * p); 66 *w++ = -sin(k * p); 67 } 68 } 69 return table; 70 } 71 72 void Scramble(const float *in, float *out, unsigned bits, unsigned *scrTable) 73 { int dummy; 74 __asm__ __volatile__ (" 75 push %0 76 push %2 77 push %3 78 .loop%=: 79 push %0 80 movl (%3), %4 81 add $4, %3 82 movl (%3), %0 83 add $4, %3 84 movq (%1,%4,8), %%mm0 85 movq (%1,%0,8), %%mm1 86 movl (%3), %4 87 add $4, %3 88 movl (%3), %0 89 add $4, %3 90 movq (%1,%4,8), %%mm2 91 movq (%1,%0,8), %%mm3 92 movl (%3), %4 93 add $4, %3 94 movl (%3), %0 95 add $4, %3 96 movq (%1,%4,8), %%mm4 97 movq (%1,%0,8), %%mm5 98 movl (%3), %4 99 add $4, %3 100 movl (%3), %0 101 add $4, %3 102 movq (%1,%4,8), %%mm6 103 movq (%1,%0,8), %%mm7 104 movq %%mm0, (%2) 105 add $8, %2 106 movq %%mm1, (%2) 107 add $8, %2 108 movq %%mm2, (%2) 109 add $8, %2 110 movq %%mm3, (%2) 111 add $8, %2 112 movq %%mm4, (%2) 113 add $8, %2 114 movq %%mm5, (%2) 115 add $8, %2 116 movq %%mm6, (%2) 117 add $8, %2 118 movq %%mm7, (%2) 119 add $8, %2 120 pop %0 121 dec %0 122 jne .loop%= 123 femms 124 pop %3 125 pop %2 126 pop %0 127 ": : "q" (1 << (bits - 3)), "r" (in), "r" (out), "r" (scrTable), "q" (dummy) 128 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 129 ); 130 131 } 132 133 void FFTRecursive(float *data, unsigned fftBits, const float *wsave, unsigned repeat, int reverse) 134 { unsigned fftPoints, fftHalfPoints; 135 int mask[2] = {0x80000000, 0x00000000}; 136 if(fftBits == 1) 137 return; 138 fftPoints = 1 << fftBits; 139 fftHalfPoints = fftPoints >> 1; 140 if(fftBits > 11) 141 { FFTRecursive(data, fftBits - 1, wsave + fftHalfPoints, repeat, reverse); 142 FFTRecursive(data + fftHalfPoints, fftBits - 1, wsave + fftHalfPoints, repeat, reverse); 143 } 144 else 145 FFTRecursive(data, fftBits - 1, wsave + fftHalfPoints, repeat << 1, reverse); 146 if(reverse) 147 if(fftBits > 2) 148 { int dummy1, dummy2, dummy3, dummy4; 149 while(repeat--) 150 { __asm__ __volatile__ (" 151 .loop%=: 152 movq (%0), %%mm0 153 movq 8(%0), %%mm4 154 pswapd %%mm0, %%mm2 155 pswapd %%mm4, %%mm6 156 movq (%1), %%mm1 157 movq 8(%1), %%mm5 158 pfmul %%mm1, %%mm0 159 pfmul %%mm1, %%mm2 160 pfmul %%mm5, %%mm4 161 pfmul %%mm5, %%mm6 162 pfpnacc %%mm0, %%mm2 163 pfpnacc %%mm4, %%mm6 164 pswapd %%mm2, %%mm0 165 pswapd %%mm6, %%mm4 166 movq (%2), %%mm3 167 movq 8(%2), %%mm7 168 movq %%mm3, %%mm1 169 movq %%mm7, %%mm5 170 pfsub %%mm0, %%mm3 171 pfadd %%mm0, %%mm1 172 pfsub %%mm4, %%mm7 173 pfadd %%mm4, %%mm5 174 movq %%mm1, (%2) 175 movq %%mm5, 8(%2) 176 add $16, %1 177 add $16, %2 178 movq %%mm3, (%0) 179 movq %%mm7, 8(%0) 180 add $16, %0 181 dec %3 182 jne .loop%= 183 ": "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) 184 : "0" (data + fftHalfPoints), "1" (wsave), "2" (data), "3" (fftHalfPoints >> 1) 185 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 186 ); 187 data += fftPoints; 188 } 189 } 190 else 191 { __asm__ __volatile__ (" 192 push %0 193 push %1 194 movq (%2), %%mm7 195 .loop%=: 196 movq (%0), %%mm0 197 movq 8(%0), %%mm1 198 movq %%mm0, %%mm4 199 pfadd %%mm1, %%mm0 200 movq 16(%0), %%mm2 201 movq 24(%0), %%mm3 202 movq %%mm2, %%mm5 203 pfadd %%mm3, %%mm2 204 pfsub %%mm1, %%mm4 205 pfsub %%mm3, %%mm5 206 movq %%mm0, %%mm1 207 pfsub %%mm2, %%mm0 208 pfadd %%mm2, %%mm1 209 pswapd %%mm5, %%mm5 210 movq %%mm0, 16(%0) 211 movq %%mm1, (%0) 212 pxor %%mm7, %%mm5 213 movq %%mm4, %%mm0 214 pfadd %%mm5, %%mm0 215 pfsub %%mm5, %%mm4 216 movq %%mm0, 8(%0) 217 movq %%mm4, 24(%0) 218 add $32, %0 219 dec %1 220 jne .loop%= 221 pop %1 222 pop %0 223 ": : "r" (data), "q" (repeat), "r" (mask) 224 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 225 ); 226 } 227 else 228 if(fftBits > 2) 229 { int dummy1, dummy2, dummy3, dummy4; 230 while(repeat--) 231 { __asm__ __volatile__ (" 232 .loop%=: 233 movq (%0), %%mm0 234 movq 8(%0), %%mm4 235 pswapd %%mm0, %%mm2 236 pswapd %%mm4, %%mm6 237 movq (%1), %%mm1 238 movq 8(%1), %%mm5 239 pfmul %%mm1, %%mm0 240 pfmul %%mm1, %%mm2 241 pfmul %%mm5, %%mm4 242 pfmul %%mm5, %%mm6 243 pfpnacc %%mm2, %%mm0 244 pfpnacc %%mm6, %%mm4 245 movq (%2), %%mm3 246 movq 8(%2), %%mm7 247 movq %%mm3, %%mm1 248 movq %%mm7, %%mm5 249 pfsub %%mm0, %%mm3 250 pfadd %%mm0, %%mm1 251 pfsub %%mm4, %%mm7 252 pfadd %%mm4, %%mm5 253 movq %%mm1, (%2) 254 movq %%mm5, 8(%2) 255 add $16, %1 256 add $16, %2 257 movq %%mm3, (%0) 258 movq %%mm7, 8(%0) 259 add $16, %0 260 dec %3 261 jne .loop%= 262 ": "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) 263 : "0" (data + fftHalfPoints), "1" (wsave), "2" (data), "3" (fftHalfPoints >> 1) 264 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 265 ); 266 267 data += fftPoints; 268 } 269 } 270 else 271 { __asm__ __volatile__ (" 272 push %0 273 push %1 274 movq (%2), %%mm7 275 .loop%=: 276 movq (%0), %%mm0 277 movq 8(%0), %%mm1 278 movq %%mm0, %%mm4 279 pfadd %%mm1, %%mm0 280 movq 16(%0), %%mm2 281 movq 24(%0), %%mm3 282 movq %%mm2, %%mm5 283 pfadd %%mm3, %%mm2 284 pfsub %%mm1, %%mm4 285 pfsub %%mm3, %%mm5 286 movq %%mm0, %%mm1 287 pfsub %%mm2, %%mm0 288 pfadd %%mm2, %%mm1 289 pswapd %%mm5, %%mm5 290 movq %%mm0, 16(%0) 291 movq %%mm1, (%0) 292 pxor %%mm7, %%mm5 293 movq %%mm4, %%mm0 294 pfadd %%mm5, %%mm0 295 pfsub %%mm5, %%mm4 296 movq %%mm0, 24(%0) 297 movq %%mm4, 8(%0) 298 add $32, %0 299 dec %1 300 jne .loop%= 301 pop %1 302 pop %0 303 ": : "r" (data), "q" (repeat), "r" (mask) 304 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 305 ); 306 } 307 __asm__ __volatile__ ("femms": : 308 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" 309 ); 310 } 311

Noch kein Kommentar vorhanden

Dieses Snippet kommentieren

Name *  

E-Mail (wird nicht angezeigt) *    

Website  

Kommentar *  

Sicherheitscode Sicherheitscode *    

RSS