1 #include <math.h>
2
3 template <class T>
4 inline void vec_copy(const T *x, T *y, int len)
5 {
6 while (len > 3)
7 {
8 *y++ = *x++;
9 *y++ = *x++;
10 *y++ = *x++;
11 *y++ = *x++;
12 len -= 4;
13 }
14 while (len)
15 {
16 *y++ = *x++;
17 len--;
18 }
19 }
20
21
22 template <class T>
23 inline T vec_inner_prod(const T *a, const T *b, int len)
24 {
25 T sum1=0, sum2=0, sum3=0, sum4=0;
26 const T *end = a+len;
27 while (a<end-3)
28 {
29 sum1+=a[0]*b[0];
30 sum2+=a[1]*b[1];
31 sum3+=a[2]*b[2];
32 sum4+=a[3]*b[3];
33 a+=4;
34 b+=4;
35 }
36 while (a<end)
37 {
38 sum1+=a[0]*b[0];
39 a++; b++;
40 }
41 return (sum1+sum2)+(sum3+sum4);
42 }
43
44 template <class T>
45 inline void vec_add_vec(const T *a, const T *b, T *c, int len)
46 {
47 const T *end = a+len;
48 while (a<end-3)
49 {
50 c[0]=a[0]+b[0];
51 c[1]=a[1]+b[1];
52 c[2]=a[2]+b[2];
53 c[3]=a[3]+b[3];
54 a+=4;
55 b+=4;
56 c+=4;
57 }
58 while (a<end)
59 {
60 c[0]=a[0]+b[0];
61 a++; b++; c++;
62 }
63 }
64
65 template <class T>
66 inline void vec_sub_vec(const T *a, const T *b, T *c, int len)
67 {
68 const T *end = a+len;
69 while (a<end-3)
70 {
71 c[0]=a[0]-b[0];
72 c[1]=a[1]-b[1];
73 c[2]=a[2]-b[2];
74 c[3]=a[3]-b[3];
75 a+=4;
76 b+=4;
77 c+=4;
78 }
79 while (a<end)
80 {
81 c[0]=a[0]-b[0];
82 a++; b++; c++;
83 }
84 }
85
86 template <class T>
87 inline void vec_mul_vec(const T *a, const T *b, T *c, int len)
88 {
89 const T *end = a+len;
90 while (a<end-3)
91 {
92 c[0]=a[0]*b[0];
93 c[1]=a[1]*b[1];
94 c[2]=a[2]*b[2];
95 c[3]=a[3]*b[3];
96 a+=4;
97 b+=4;
98 c+=4;
99 }
100 while (a<end)
101 {
102 c[0]=a[0]*b[0];
103 a++; b++; c++;
104 }
105 }
106
107 template <class T>
108 inline void vec_mul_and_add(const T *a, const T *b, T *c, int len)
109 {
110 const T *end = a+len;
111 while (a<end-3)
112 {
113 c[0]+=a[0]*b[0];
114 c[1]+=a[1]*b[1];
115 c[2]+=a[2]*b[2];
116 c[3]+=a[3]*b[3];
117 a+=4;
118 b+=4;
119 c+=4;
120 }
121 while (a<end)
122 {
123 c[0]+=a[0]*b[0];
124 a++; b++; c++;
125 }
126 }
127
128 template <class T>
129 inline void vec_mul_and_add(const T a, const T *b, T *c, int len)
130 {
131 const T *end = b+len;
132 while (b<end-3)
133 {
134 c[0]+=a*b[0];
135 c[1]+=a*b[1];
136 c[2]+=a*b[2];
137 c[3]+=a*b[3];
138 b+=4;
139 c+=4;
140 }
141 while (b<end)
142 {
143 c[0]+=a*b[0];
144 b++; c++;
145 }
146 }
147
148 template <class T>
149 inline void vec_div_vec(const T *a, const T *b, T *c, int len)
150 {
151 const T *end = a+len;
152 while (a<end-3)
153 {
154 c[0]=a[0]/b[0];
155 c[1]=a[1]/b[1];
156 c[2]=a[2]/b[2];
157 c[3]=a[3]/b[3];
158 a+=4;
159 b+=4;
160 c+=4;
161 }
162 while (a<end)
163 {
164 c[0]=a[0]/b[0];
165 a++; b++; c++;
166 }
167 }
168
169
170 template <class T>
171 inline void vec_add_scal(const T a, const T *b, T *c, int len)
172 {
173 const T *end = b+len;
174 while (b<end-3)
175 {
176 c[0]=a+b[0];
177 c[1]=a+b[1];
178 c[2]=a+b[2];
179 c[3]=a+b[3];
180 b+=4;
181 c+=4;
182 }
183 while (b<end)
184 {
185 c[0]=a+b[0];
186 b++; c++;
187 }
188 }
189
190 template <class T>
191 inline void vec_mul_scal(const T a, const T *b, T *c, int len)
192 {
193 const T *end = b+len;
194 while (b<end-3)
195 {
196 c[0]=a*b[0];
197 c[1]=a*b[1];
198 c[2]=a*b[2];
199 c[3]=a*b[3];
200 b+=4;
201 c+=4;
202 }
203 while (b<end)
204 {
205 c[0]=a*b[0];
206 b++; c++;
207 }
208 }
209
210 template <class T>
211 inline T vec_dist2(const T *a, const T *b, int len)
212 {
213 T sum1=0, sum2=0, sum3=0, sum4=0;
214 const T *end = a+len;
215 while (a<end-3)
216 {
217 sum1+=(a[0]-b[0])*(a[0]-b[0]);
218 sum2+=(a[1]-b[1])*(a[1]-b[1]);
219 sum3+=(a[2]-b[2])*(a[2]-b[2]);
220 sum4+=(a[3]-b[3])*(a[3]-b[3]);
221 a+=4;
222 b+=4;
223 }
224 while (a<end)
225 {
226 sum1+=(a[0]-b[0])*(a[0]-b[0]);
227 a++; b++;
228 }
229 return (sum1+sum2)+(sum3+sum4);
230 }
231
232 template <class T>
233 inline T vec_mahalanobis2(const T *a, const T *b, const T *c, int len)
234 {
235 T sum1=0, sum2=0, sum3=0, sum4=0;
236 const T *end = a+len;
237 while (a<end-3)
238 {
239 sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
240 sum2+=c[1]*(a[1]-b[1])*(a[1]-b[1]);
241 sum3+=c[2]*(a[2]-b[2])*(a[2]-b[2]);
242 sum4+=c[3]*(a[3]-b[3])*(a[3]-b[3]);
243 a+=4;
244 b+=4;
245 }
246 while (a<end)
247 {
248 sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
249 a++; b++;
250 }
251 return (sum1+sum2)+(sum3+sum4);
252 }
253
254 template <class T>
255 inline T vec_sum(const T *a, int len)
256 {
257 T sum1=0, sum2=0, sum3=0, sum4=0;
258 const T *end = a+len;
259 while (a<end-3)
260 {
261 sum1+=a[0];
262 sum2+=a[1];
263 sum3+=a[2];
264 sum4+=a[3];
265 a+=4;
266 }
267 while (a<end)
268 {
269 sum1+=a[0];
270 a++;
271 }
272 return (sum1+sum2)+(sum3+sum4);
273 }
274
275 template <class T>
276 inline T vec_norm2(const T *a, int len)
277 {
278 T sum1=0, sum2=0, sum3=0, sum4=0;
279 const T *end = a+len;
280 while (a<end-3)
281 {
282 sum1+=a[0]*a[0];
283 sum2+=a[1]*a[0];
284 sum3+=a[2]*a[0];
285 sum4+=a[3]*a[0];
286 a+=4;
287 }
288 while (a<end)
289 {
290 sum1+=a[0]*a[0];
291 a++;
292 }
293 return (sum1+sum2)+(sum3+sum4);
294 }
295
296 template <class T>
297 inline void vec_inv(const T *a, T *b, int len)
298 {
299 const T *end = a+len;
300 while (a<end-3)
301 {
302 b[0]=1/a[0];
303 b[1]=1/a[1];
304 b[2]=1/a[2];
305 b[3]=1/a[3];
306 a+=4; b+=4;
307 }
308 while (a<end)
309 {
310 b[0]=1/a[0];
311 a++; b++;
312 }
313 }
314
315 template <class T>
316 inline void vec_sqrt(const T *a, T *b, int len)
317 {
318 const T *end = a+len;
319 while (a<end-3)
320 {
321 b[0]=sqrt(a[0]);
322 b[1]=sqrt(a[1]);
323 b[2]=sqrt(a[2]);
324 b[3]=sqrt(a[3]);
325 a+=4; b+=4;
326 }
327 while (a<end)
328 {
329 b[0]=sqrt(a[0]);
330 a++; b++;
331 }
332 }
333
334 #ifdef _USE_3DNOW
335
336 #define FP_DIRTY : "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", "memory"
337
338
339 template <>
340 inline float vec_inner_prod<float>(const float *a, const float *b, int len)
341 {
342 //float sum=0;
343 float sum[2]={0,0};
344 __asm__ __volatile__ (
345 "
346 push %%eax
347 push %%edi
348 push %%ecx
349 pxor %%mm4, %%mm4
350 pxor %%mm5, %%mm5
351
352 sub $4, %%ecx
353 //jb mul4_skip
354 jb .+47
355
356 //mul4_loop:
357 movq (%%eax), %%mm0
358 movq (%%edi), %%mm1
359 movq 8(%%eax), %%mm2
360 movq 8(%%edi), %%mm3
361 add $16, %%eax
362 add $16, %%edi
363 pfmul %%mm0, %%mm1
364 pfmul %%mm2, %%mm3
365 pfadd %%mm1, %%mm4
366 pfadd %%mm3, %%mm5
367
368 sub $4, %%ecx
369
370 //jae mul4_loop
371 jae .-39
372
373 pfadd %%mm5,%%mm4
374
375 //mul4_skip:
376
377 add $2, %%ecx
378 //jae mul2_skip
379 jae .+22
380
381 movq (%%eax), %%mm0
382 movq (%%edi), %%mm1
383 add $8, %%eax
384 add $8, %%edi
385 pfmul %%mm0, %%mm1
386 pfadd %%mm1, %%mm4
387 //mul2_skip:
388
389 and $1, %%ecx
390 //jz even
391 jz .+22
392
393 pxor %%mm0, %%mm0
394 pxor %%mm1, %%mm1
395 movd (%%eax), %%mm0
396 movd (%%edi), %%mm1
397 pfmul %%mm0, %%mm1
398 pfadd %%mm1, %%mm4
399 //even:
400
401 pxor %%mm5, %%mm5
402 pfacc %%mm5, %%mm4
403 movq %%mm4, (%%edx)
404
405 pop %%ecx
406 pop %%edi
407 pop %%eax
408 emms
409 "
410 : : "a" (a), "D" (b), "c" (len), "d" (sum)
411 FP_DIRTY
412 );
413
414 return sum[0];
415 }
416
417
418 template <>
419 inline void vec_add_vec<float>(const float *a, const float *b, float *c, int len)
420 {
421 __asm__ __volatile__ (
422 "
423 push %%eax
424 push %%edi
425 push %%ecx
426 push %%edx
427
428 sub $4, %%ecx
429 //jb mul4_skip
430 jb .+45
431
432 //mul4_loop:
433 movq (%%eax), %%mm0
434 movq (%%edi), %%mm1
435 pfadd %%mm0, %%mm1
436 movq 8(%%eax), %%mm2
437 movq 8(%%edi), %%mm3
438 pfadd %%mm2, %%mm3
439
440 movq %%mm1, (%%edx)
441 movq %%mm3, 8(%%edx)
442 add $16, %%eax
443 add $16, %%edi
444 add $16, %%edx
445 sub $4, %%ecx
446
447 //jae mul4_loop
448 jae .-41
449
450 //mul4_skip:
451
452 add $2, %%ecx
453 //jae mul2_skip
454 jae .+24
455
456 movq (%%eax), %%mm0
457 movq (%%edi), %%mm1
458 pfadd %%mm0, %%mm1
459 movq %%mm1, (%%edx)
460 add $8, %%eax
461 add $8, %%edi
462 add $8, %%edx
463
464 //mul2_skip:
465
466 and $1, %%ecx
467 //jz even
468 jz .+21
469
470 pxor %%mm0, %%mm0
471 pxor %%mm1, %%mm1
472 movd (%%eax), %%mm0
473 movd (%%edi), %%mm1
474 pfadd %%mm0, %%mm1
475 movd %%mm1, (%%edx)
476 //even:
477
478 pop %%edx
479 pop %%ecx
480 pop %%edi
481 pop %%eax
482 emms
483 "
484 : : "a" (a), "D" (b), "c" (len), "d" (c)
485 FP_DIRTY
486 );
487 }
488
489
490 template <>
491 inline void vec_sub_vec<float>(const float *a, const float *b, float *c, int len)
492 {
493 __asm__ __volatile__ (
494 "
495 push %%eax
496 push %%edi
497 push %%ecx
498 push %%edx
499
500 sub $4, %%ecx
501 //jb mul4_skip
502 jb .+45
503
504 //mul4_loop:
505 movq (%%eax), %%mm0
506 movq (%%edi), %%mm1
507 pfsubr %%mm0, %%mm1
508 movq 8(%%eax), %%mm2
509 movq 8(%%edi), %%mm3
510 pfsubr %%mm2, %%mm3
511
512 movq %%mm1, (%%edx)
513 movq %%mm3, 8(%%edx)
514 add $16, %%eax
515 add $16, %%edi
516 add $16, %%edx
517 sub $4, %%ecx
518
519 //jae mul4_loop
520 jae .-41
521
522 //mul4_skip:
523
524 add $2, %%ecx
525 //jae mul2_skip
526 jae .+24
527
528 movq (%%eax), %%mm0
529 movq (%%edi), %%mm1
530 pfsubr %%mm0, %%mm1
531 movq %%mm1, (%%edx)
532 add $8, %%eax
533 add $8, %%edi
534 add $8, %%edx
535
536 //mul2_skip:
537
538 and $1, %%ecx
539 //jz even
540 jz .+21
541
542 pxor %%mm0, %%mm0
543 pxor