Du bist hier: Snippet-Verzeichnis » C/C++ (495)
Sprache:

3D Now! vector primitives

Sprache: English
Programmiersprache: C++
Veröffentlicht von: jmvalin [nicht registriert]
Letzte Änderung: 15.05.2006
Aufrufe: 928


Beschreibung

Vector primitives (add, multiply, inner product, distance, ...) available as cross-platform template code and 3D Now! optimized. Part of the Overflow (aka Open Mind Speech) project.

Code

1 #include <math.h> 2 3 template <class T> 4 inline void vec_copy(const T *x, T *y, int len) 5 { 6 while (len > 3) 7 { 8 *y++ = *x++; 9 *y++ = *x++; 10 *y++ = *x++; 11 *y++ = *x++; 12 len -= 4; 13 } 14 while (len) 15 { 16 *y++ = *x++; 17 len--; 18 } 19 } 20 21 22 template <class T> 23 inline T vec_inner_prod(const T *a, const T *b, int len) 24 { 25 T sum1=0, sum2=0, sum3=0, sum4=0; 26 const T *end = a+len; 27 while (a<end-3) 28 { 29 sum1+=a[0]*b[0]; 30 sum2+=a[1]*b[1]; 31 sum3+=a[2]*b[2]; 32 sum4+=a[3]*b[3]; 33 a+=4; 34 b+=4; 35 } 36 while (a<end) 37 { 38 sum1+=a[0]*b[0]; 39 a++; b++; 40 } 41 return (sum1+sum2)+(sum3+sum4); 42 } 43 44 template <class T> 45 inline void vec_add_vec(const T *a, const T *b, T *c, int len) 46 { 47 const T *end = a+len; 48 while (a<end-3) 49 { 50 c[0]=a[0]+b[0]; 51 c[1]=a[1]+b[1]; 52 c[2]=a[2]+b[2]; 53 c[3]=a[3]+b[3]; 54 a+=4; 55 b+=4; 56 c+=4; 57 } 58 while (a<end) 59 { 60 c[0]=a[0]+b[0]; 61 a++; b++; c++; 62 } 63 } 64 65 template <class T> 66 inline void vec_sub_vec(const T *a, const T *b, T *c, int len) 67 { 68 const T *end = a+len; 69 while (a<end-3) 70 { 71 c[0]=a[0]-b[0]; 72 c[1]=a[1]-b[1]; 73 c[2]=a[2]-b[2]; 74 c[3]=a[3]-b[3]; 75 a+=4; 76 b+=4; 77 c+=4; 78 } 79 while (a<end) 80 { 81 c[0]=a[0]-b[0]; 82 a++; b++; c++; 83 } 84 } 85 86 template <class T> 87 inline void vec_mul_vec(const T *a, const T *b, T *c, int len) 88 { 89 const T *end = a+len; 90 while (a<end-3) 91 { 92 c[0]=a[0]*b[0]; 93 c[1]=a[1]*b[1]; 94 c[2]=a[2]*b[2]; 95 c[3]=a[3]*b[3]; 96 a+=4; 97 b+=4; 98 c+=4; 99 } 100 while (a<end) 101 { 102 c[0]=a[0]*b[0]; 103 a++; b++; c++; 104 } 105 } 106 107 template <class T> 108 inline void vec_mul_and_add(const T *a, const T *b, T *c, int len) 109 { 110 const T *end = a+len; 111 while (a<end-3) 112 { 113 c[0]+=a[0]*b[0]; 114 c[1]+=a[1]*b[1]; 115 c[2]+=a[2]*b[2]; 116 c[3]+=a[3]*b[3]; 117 a+=4; 118 b+=4; 119 c+=4; 120 } 121 while (a<end) 122 { 123 c[0]+=a[0]*b[0]; 124 a++; b++; c++; 125 } 126 } 127 128 template <class T> 129 inline void vec_mul_and_add(const T a, const T *b, T *c, int len) 130 { 131 const T *end = b+len; 132 while (b<end-3) 133 { 134 c[0]+=a*b[0]; 135 c[1]+=a*b[1]; 136 c[2]+=a*b[2]; 137 c[3]+=a*b[3]; 138 b+=4; 139 c+=4; 140 } 141 while (b<end) 142 { 143 c[0]+=a*b[0]; 144 b++; c++; 145 } 146 } 147 148 template <class T> 149 inline void vec_div_vec(const T *a, const T *b, T *c, int len) 150 { 151 const T *end = a+len; 152 while (a<end-3) 153 { 154 c[0]=a[0]/b[0]; 155 c[1]=a[1]/b[1]; 156 c[2]=a[2]/b[2]; 157 c[3]=a[3]/b[3]; 158 a+=4; 159 b+=4; 160 c+=4; 161 } 162 while (a<end) 163 { 164 c[0]=a[0]/b[0]; 165 a++; b++; c++; 166 } 167 } 168 169 170 template <class T> 171 inline void vec_add_scal(const T a, const T *b, T *c, int len) 172 { 173 const T *end = b+len; 174 while (b<end-3) 175 { 176 c[0]=a+b[0]; 177 c[1]=a+b[1]; 178 c[2]=a+b[2]; 179 c[3]=a+b[3]; 180 b+=4; 181 c+=4; 182 } 183 while (b<end) 184 { 185 c[0]=a+b[0]; 186 b++; c++; 187 } 188 } 189 190 template <class T> 191 inline void vec_mul_scal(const T a, const T *b, T *c, int len) 192 { 193 const T *end = b+len; 194 while (b<end-3) 195 { 196 c[0]=a*b[0]; 197 c[1]=a*b[1]; 198 c[2]=a*b[2]; 199 c[3]=a*b[3]; 200 b+=4; 201 c+=4; 202 } 203 while (b<end) 204 { 205 c[0]=a*b[0]; 206 b++; c++; 207 } 208 } 209 210 template <class T> 211 inline T vec_dist2(const T *a, const T *b, int len) 212 { 213 T sum1=0, sum2=0, sum3=0, sum4=0; 214 const T *end = a+len; 215 while (a<end-3) 216 { 217 sum1+=(a[0]-b[0])*(a[0]-b[0]); 218 sum2+=(a[1]-b[1])*(a[1]-b[1]); 219 sum3+=(a[2]-b[2])*(a[2]-b[2]); 220 sum4+=(a[3]-b[3])*(a[3]-b[3]); 221 a+=4; 222 b+=4; 223 } 224 while (a<end) 225 { 226 sum1+=(a[0]-b[0])*(a[0]-b[0]); 227 a++; b++; 228 } 229 return (sum1+sum2)+(sum3+sum4); 230 } 231 232 template <class T> 233 inline T vec_mahalanobis2(const T *a, const T *b, const T *c, int len) 234 { 235 T sum1=0, sum2=0, sum3=0, sum4=0; 236 const T *end = a+len; 237 while (a<end-3) 238 { 239 sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]); 240 sum2+=c[1]*(a[1]-b[1])*(a[1]-b[1]); 241 sum3+=c[2]*(a[2]-b[2])*(a[2]-b[2]); 242 sum4+=c[3]*(a[3]-b[3])*(a[3]-b[3]); 243 a+=4; 244 b+=4; 245 } 246 while (a<end) 247 { 248 sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]); 249 a++; b++; 250 } 251 return (sum1+sum2)+(sum3+sum4); 252 } 253 254 template <class T> 255 inline T vec_sum(const T *a, int len) 256 { 257 T sum1=0, sum2=0, sum3=0, sum4=0; 258 const T *end = a+len; 259 while (a<end-3) 260 { 261 sum1+=a[0]; 262 sum2+=a[1]; 263 sum3+=a[2]; 264 sum4+=a[3]; 265 a+=4; 266 } 267 while (a<end) 268 { 269 sum1+=a[0]; 270 a++; 271 } 272 return (sum1+sum2)+(sum3+sum4); 273 } 274 275 template <class T> 276 inline T vec_norm2(const T *a, int len) 277 { 278 T sum1=0, sum2=0, sum3=0, sum4=0; 279 const T *end = a+len; 280 while (a<end-3) 281 { 282 sum1+=a[0]*a[0]; 283 sum2+=a[1]*a[0]; 284 sum3+=a[2]*a[0]; 285 sum4+=a[3]*a[0]; 286 a+=4; 287 } 288 while (a<end) 289 { 290 sum1+=a[0]*a[0]; 291 a++; 292 } 293 return (sum1+sum2)+(sum3+sum4); 294 } 295 296 template <class T> 297 inline void vec_inv(const T *a, T *b, int len) 298 { 299 const T *end = a+len; 300 while (a<end-3) 301 { 302 b[0]=1/a[0]; 303 b[1]=1/a[1]; 304 b[2]=1/a[2]; 305 b[3]=1/a[3]; 306 a+=4; b+=4; 307 } 308 while (a<end) 309 { 310 b[0]=1/a[0]; 311 a++; b++; 312 } 313 } 314 315 template <class T> 316 inline void vec_sqrt(const T *a, T *b, int len) 317 { 318 const T *end = a+len; 319 while (a<end-3) 320 { 321 b[0]=sqrt(a[0]); 322 b[1]=sqrt(a[1]); 323 b[2]=sqrt(a[2]); 324 b[3]=sqrt(a[3]); 325 a+=4; b+=4; 326 } 327 while (a<end) 328 { 329 b[0]=sqrt(a[0]); 330 a++; b++; 331 } 332 } 333 334 #ifdef _USE_3DNOW 335 336 #define FP_DIRTY : "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", "memory" 337 338 339 template <> 340 inline float vec_inner_prod<float>(const float *a, const float *b, int len) 341 { 342 //float sum=0; 343 float sum[2]={0,0}; 344 __asm__ __volatile__ ( 345 " 346 push %%eax 347 push %%edi 348 push %%ecx 349 pxor %%mm4, %%mm4 350 pxor %%mm5, %%mm5 351 352 sub $4, %%ecx 353 //jb mul4_skip 354 jb .+47 355 356 //mul4_loop: 357 movq (%%eax), %%mm0 358 movq (%%edi), %%mm1 359 movq 8(%%eax), %%mm2 360 movq 8(%%edi), %%mm3 361 add $16, %%eax 362 add $16, %%edi 363 pfmul %%mm0, %%mm1 364 pfmul %%mm2, %%mm3 365 pfadd %%mm1, %%mm4 366 pfadd %%mm3, %%mm5 367 368 sub $4, %%ecx 369 370 //jae mul4_loop 371 jae .-39 372 373 pfadd %%mm5,%%mm4 374 375 //mul4_skip: 376 377 add $2, %%ecx 378 //jae mul2_skip 379 jae .+22 380 381 movq (%%eax), %%mm0 382 movq (%%edi), %%mm1 383 add $8, %%eax 384 add $8, %%edi 385 pfmul %%mm0, %%mm1 386 pfadd %%mm1, %%mm4 387 //mul2_skip: 388 389 and $1, %%ecx 390 //jz even 391 jz .+22 392 393 pxor %%mm0, %%mm0 394 pxor %%mm1, %%mm1 395 movd (%%eax), %%mm0 396 movd (%%edi), %%mm1 397 pfmul %%mm0, %%mm1 398 pfadd %%mm1, %%mm4 399 //even: 400 401 pxor %%mm5, %%mm5 402 pfacc %%mm5, %%mm4 403 movq %%mm4, (%%edx) 404 405 pop %%ecx 406 pop %%edi 407 pop %%eax 408 emms 409 " 410 : : "a" (a), "D" (b), "c" (len), "d" (sum) 411 FP_DIRTY 412 ); 413 414 return sum[0]; 415 } 416 417 418 template <> 419 inline void vec_add_vec<float>(const float *a, const float *b, float *c, int len) 420 { 421 __asm__ __volatile__ ( 422 " 423 push %%eax 424 push %%edi 425 push %%ecx 426 push %%edx 427 428 sub $4, %%ecx 429 //jb mul4_skip 430 jb .+45 431 432 //mul4_loop: 433 movq (%%eax), %%mm0 434 movq (%%edi), %%mm1 435 pfadd %%mm0, %%mm1 436 movq 8(%%eax), %%mm2 437 movq 8(%%edi), %%mm3 438 pfadd %%mm2, %%mm3 439 440 movq %%mm1, (%%edx) 441 movq %%mm3, 8(%%edx) 442 add $16, %%eax 443 add $16, %%edi 444 add $16, %%edx 445 sub $4, %%ecx 446 447 //jae mul4_loop 448 jae .-41 449 450 //mul4_skip: 451 452 add $2, %%ecx 453 //jae mul2_skip 454 jae .+24 455 456 movq (%%eax), %%mm0 457 movq (%%edi), %%mm1 458 pfadd %%mm0, %%mm1 459 movq %%mm1, (%%edx) 460 add $8, %%eax 461 add $8, %%edi 462 add $8, %%edx 463 464 //mul2_skip: 465 466 and $1, %%ecx 467 //jz even 468 jz .+21 469 470 pxor %%mm0, %%mm0 471 pxor %%mm1, %%mm1 472 movd (%%eax), %%mm0 473 movd (%%edi), %%mm1 474 pfadd %%mm0, %%mm1 475 movd %%mm1, (%%edx) 476 //even: 477 478 pop %%edx 479 pop %%ecx 480 pop %%edi 481 pop %%eax 482 emms 483 " 484 : : "a" (a), "D" (b), "c" (len), "d" (c) 485 FP_DIRTY 486 ); 487 } 488 489 490 template <> 491 inline void vec_sub_vec<float>(const float *a, const float *b, float *c, int len) 492 { 493 __asm__ __volatile__ ( 494 " 495 push %%eax 496 push %%edi 497 push %%ecx 498 push %%edx 499 500 sub $4, %%ecx 501 //jb mul4_skip 502 jb .+45 503 504 //mul4_loop: 505 movq (%%eax), %%mm0 506 movq (%%edi), %%mm1 507 pfsubr %%mm0, %%mm1 508 movq 8(%%eax), %%mm2 509 movq 8(%%edi), %%mm3 510 pfsubr %%mm2, %%mm3 511 512 movq %%mm1, (%%edx) 513 movq %%mm3, 8(%%edx) 514 add $16, %%eax 515 add $16, %%edi 516 add $16, %%edx 517 sub $4, %%ecx 518 519 //jae mul4_loop 520 jae .-41 521 522 //mul4_skip: 523 524 add $2, %%ecx 525 //jae mul2_skip 526 jae .+24 527 528 movq (%%eax), %%mm0 529 movq (%%edi), %%mm1 530 pfsubr %%mm0, %%mm1 531 movq %%mm1, (%%edx) 532 add $8, %%eax 533 add $8, %%edi 534 add $8, %%edx 535 536 //mul2_skip: 537 538 and $1, %%ecx 539 //jz even 540 jz .+21 541 542 pxor %%mm0, %%mm0 543 pxor