Du bist hier: Snippet-Verzeichnis » C/C++ (495)
Sprache:

3D Now! FFT implementation

Sprache: English
Programmiersprache: C
Veröffentlicht von: jmvalin [nicht registriert]
Letzte Änderung: 15.05.2006
Aufrufe: 1018


Beschreibung

A (very) Fast Fourier Transform (FFT) implementation using 3DNow! assembly. On my Athlon 1.33, I get 1.6 gflops performance. Note: AMD Athlon only (not K6-II)

Code

1 /* Copyright Jean-Marc Valin 2001 */ 2 /* This software is released under the version 2.1 of the 3 GNU Lesser General Public License (LGPL)*/ 4 5 /*Example: for an (out-of-place) 256 transform: 6 COMPLEX in[256]; 7 COMPLEX out[256]; 8 COMPLEX w[256]; 9 int bits[256]; 10 fft_initCosSinTables(w, bits, 8); 11 void fft(in, out, 8, w, bits); 12 13 Note: fft_initCosSinTables only need to be called once for each size 14 8 is the order of the FFT (2^8=256) 15 */ 16 17 #include <math.h> 18 19 typedef struct COMPLEX { 20 float re; 21 float im; 22 } COMPLEX; 23 24 void fft_initCosSinTables(COMPLEX *w, int *bits, int M) 25 { 26 int i,j; 27 int tmp; 28 int size = 1 << M; 29 for (i=0;i<size;i++) 30 { 31 bits[i]=0; 32 tmp=i; 33 for (j=0;j<M;j++) 34 { 35 bits[i] <<= 1; 36 bits[i] += tmp&1; 37 tmp>>=1; 38 } 39 } 40 41 while (size) 42 { 43 int k; 44 float tmp, p = (2.0 * M_PI) / size; 45 COMPLEX tmp2; 46 for (k = 0; k < (size>>1); k++) { 47 tmp = k * p; 48 tmp2.re=cos(tmp); 49 tmp2.im=-sin(tmp); 50 *w++ = tmp2; 51 } 52 size >>= 1; 53 } 54 } 55 void bitrev(COMPLEX *in, COMPLEX *out, int *bits, int M) 56 { 57 int dummy; 58 int size = 1 << M; 59 60 __asm__ __volatile__ ( 61 " 62 push %0 63 push %2 64 push %3 65 .loop%=: 66 movl (%3), %4 67 movq (%1,%4,8), %%mm0 68 movl 4(%3), %4 69 movq (%1,%4,8), %%mm1 70 movq %%mm0, (%2) 71 movq %%mm1, 8(%2) 72 add $8, %3 73 add $16, %2 74 dec %0 75 jne .loop%= 76 femms 77 pop %3 78 pop %2 79 pop %0 80 " : : "q" (size>>1), "r" (in), "r" (out), "r" (bits), "q" (dummy) 81 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 82 83 } 84 85 void recurs_fft(COMPLEX *x, int M, COMPLEX *w, int repeat) 86 { 87 int dummy1, dummy2, dummy3, dummy4; 88 int N = 1 << M; 89 int N2 = N >> 1; 90 int mask = N-1; 91 if (M==0) 92 return; 93 recurs_fft(x, M-1, w+N2, repeat<<1); 94 95 if (M>2) 96 { 97 while (repeat--) 98 { 99 __asm__ __volatile__ ( 100 " 101 .loop%=: 102 movq (%0), %%mm0 103 pswapd %%mm0, %%mm2 104 movq 8(%0), %%mm4 105 pswapd %%mm4, %%mm6 106 movq (%1), %%mm1 107 movq 8(%1), %%mm5 108 pfmul %%mm1, %%mm0 109 pfmul %%mm1, %%mm2 110 movq (%2), %%mm3 111 movq 8(%2), %%mm7 112 pfmul %%mm5, %%mm4 113 pfmul %%mm5, %%mm6 114 movq %%mm3, %%mm1 115 movq %%mm7, %%mm5 116 pfpnacc %%mm2, %%mm0 117 pfpnacc %%mm6, %%mm4 118 pfsub %%mm0, %%mm3 119 pfadd %%mm0, %%mm1 120 pfsub %%mm4, %%mm7 121 pfadd %%mm4, %%mm5 122 movq %%mm1, (%2) 123 movq %%mm3, (%0) 124 movq %%mm5, 8(%2) 125 movq %%mm7, 8(%0) 126 add $16, %0 127 add $16, %1 128 add $16, %2 129 dec %3 130 jne .loop%= 131 " 132 : "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=q" (dummy4) : "0" (x+N2), "1" (w), "2" (x), "3" (N2>>1) 133 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 134 135 x+=N; 136 } 137 } else if (M==2) 138 { 139 __asm__ __volatile__ ( 140 " 141 push %0 142 push %2 143 push %3 144 .loop%=: 145 movq (%0), %%mm0 146 movq 8(%0), %%mm4 147 movq (%1), %%mm1 148 movq 8(%1), %%mm5 149 pswapd %%mm0, %%mm2 150 pswapd %%mm4, %%mm6 151 movq (%2), %%mm3 152 movq 8(%2), %%mm7 153 pfmul %%mm1, %%mm0 154 pfmul %%mm1, %%mm2 155 pfmul %%mm5, %%mm4 156 pfmul %%mm5, %%mm6 157 pfpnacc %%mm2, %%mm0 158 pfpnacc %%mm6, %%mm4 159 movq %%mm3, %%mm1 160 movq %%mm7, %%mm5 161 pfsub %%mm0, %%mm3 162 pfadd %%mm0, %%mm1 163 pfsub %%mm4, %%mm7 164 pfadd %%mm4, %%mm5 165 movq %%mm1, (%2) 166 movq %%mm3, (%0) 167 movq %%mm5, 8(%2) 168 movq %%mm7, 8(%0) 169 add $32, %0 170 add $32, %2 171 dec %3 172 jne .loop%= 173 pop %3 174 pop %2 175 pop %0 176 " 177 : : "r" (x+2), "r" (w), "r" (x), "q" (repeat) 178 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 179 180 } else { 181 __asm__ __volatile__ ( 182 " 183 push %0 184 push %1 185 .loop%=: 186 movq (%0), %%mm0 187 movq 8(%0), %%mm1 188 movq 16(%0), %%mm2 189 movq 24(%0), %%mm3 190 191 movq %%mm0, %%mm4 192 pfadd %%mm1, %%mm0 193 194 movq %%mm2, %%mm5 195 pfadd %%mm3, %%mm2 196 197 pfsub %%mm1, %%mm4 198 pfsub %%mm3, %%mm5 199 200 movq %%mm0, (%0) 201 movq %%mm4, 8(%0) 202 movq %%mm2, 16(%0) 203 movq %%mm5, 24(%0) 204 add $32, %0 205 206 dec %1 207 jne .loop%= 208 pop %1 209 pop %0 210 " 211 : : "r" (x), "q" (repeat>>1) 212 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 213 214 } 215 __asm__ __volatile__ ("femms" 216 : : 217 : "memory", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); 218 219 } 220 221 222 void fft(COMPLEX *in, COMPLEX *out, int M, COMPLEX *w, int *bits) 223 { 224 bitrev(in, out, bits, M); 225 recurs_fft(out, M, w, 1); 226 } 227

Noch kein Kommentar vorhanden

Dieses Snippet kommentieren

Name *  

E-Mail (wird nicht angezeigt) *    

Website  

Kommentar *  

Sicherheitscode Sicherheitscode *    

RSS