00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024
00025 #ifdef __SSE2__
00026 #include <xmmintrin.h>
00027 #include <emmintrin.h>
00028 #endif
00029
00030 #include <flgrCoreDispatch.h>
00031
00032 #include "flgrMeasureBaseSSE2.h"
00033
00034
00035
00036 void flgr1d_measure_min_max_spp1_SSE2_fgUINT8(FLGR_Data1D *dat, FLGR_Vector *mini, FLGR_Vector *maxi) {
00037 #ifdef __SSE2__
00038 fgUINT8 array_min[16] __attribute__ ((aligned (16)));
00039 fgUINT8 array_max[16] __attribute__ ((aligned (16)));
00040 fgUINT8 _mini, _maxi;
00041 fgUINT8 *psrc = (fgUINT8*) dat->array;
00042 __m128i *vec_psrc = (__m128i*) dat->array;
00043 __m128i rmax,rmin;
00044 int i, nbvector = dat->length>>4;
00045
00046
00047
00048 rmin = vec_psrc[0];
00049 rmax = vec_psrc[0];
00050
00051 for(i=1 ; i<nbvector ; i++) {
00052 rmin = _mm_min_epu8(rmin,vec_psrc[i]);
00053 rmax = _mm_max_epu8(rmax,vec_psrc[i]);
00054 }
00055
00056 *((__m128i*) array_min) = rmin;
00057 *((__m128i*) array_max) = rmax;
00058
00059 _mini = array_min[0];
00060 _maxi = array_max[0];
00061
00062 for(i=1 ; i<16 ; i++) {
00063 _mini = FLGR_MIN(_mini,array_min[i]);
00064 _maxi = FLGR_MAX(_maxi,array_max[i]);
00065 }
00066
00067 for(i=nbvector<<4 ; i<dat->length ; i++) {
00068 _mini = FLGR_MIN(psrc[i],_mini);
00069 _maxi = FLGR_MAX(psrc[i],_maxi);
00070 }
00071
00072 *((fgUINT8*) mini->array) = _mini;
00073 *((fgUINT8*) maxi->array) = _maxi;
00074 #endif
00075 }
00076
00077
00078 void flgr1d_measure_min_max_spp1_SSE2_fgINT16(FLGR_Data1D *dat, FLGR_Vector *mini, FLGR_Vector *maxi) {
00079 #ifdef __SSE2__
00080 fgINT16 array_min[8] __attribute__ ((aligned (16)));
00081 fgINT16 array_max[8] __attribute__ ((aligned (16)));
00082 fgINT16 _mini, _maxi;
00083 fgINT16 *psrc = (fgINT16*) dat->array;
00084 __m128i *vec_psrc = (__m128i*) dat->array;
00085 __m128i rmax,rmin;
00086 int i, nbvector = dat->length>>3;
00087
00088
00089
00090 rmin = vec_psrc[0];
00091 rmax = vec_psrc[0];
00092
00093 for(i=1 ; i<nbvector ; i++) {
00094 rmin = _mm_min_epi16(rmin,vec_psrc[i]);
00095 rmax = _mm_max_epi16(rmax,vec_psrc[i]);
00096 }
00097
00098 *((__m128i*) array_min) = rmin;
00099 *((__m128i*) array_max) = rmax;
00100
00101 _mini = array_min[0];
00102 _maxi = array_max[0];
00103
00104 for(i=1 ; i<8 ; i++) {
00105 _mini = FLGR_MIN(_mini,array_min[i]);
00106 _maxi = FLGR_MAX(_maxi,array_max[i]);
00107 }
00108
00109 for(i=nbvector<<3 ; i<dat->length ; i++) {
00110 _mini = FLGR_MIN(psrc[i],_mini);
00111 _maxi = FLGR_MAX(psrc[i],_maxi);
00112 }
00113
00114 *((fgINT16*) mini->array) = _mini;
00115 *((fgINT16*) maxi->array) = _maxi;
00116 #endif
00117 }
00118
00119
00120
00121 void flgr1d_measure_volume_u32_spp1_SSE2_fgUINT8(FLGR_Data1D *dat, FLGR_Vector *volume) {
00122 #ifdef __SSE2__
00123 fgUINT32 array_vol[4] __attribute__ ((aligned (16)));
00124 fgUINT32 _volume=0;
00125 fgUINT8 *psrc = (fgUINT8*) dat->array;
00126 __m128i *vec_psrc = (__m128i*) dat->array;
00127 __m128i rvol;
00128 __m128i v1, v2;
00129 __m128i v3;
00130 __m128i vec_zero;
00131 int i, nbvector = dat->length>>4;
00132
00133
00134
00135 v1=vec_psrc[0];
00136 vec_zero = _mm_xor_si128( v1, v1 );
00137
00138 rvol = vec_zero;
00139
00140
00141 for(i=0 ; i<nbvector ; i++) {
00142 v2 = vec_psrc[i];
00143 v1 = _mm_unpackhi_epi8(v2,vec_zero);
00144 v2 = _mm_unpacklo_epi8(v2,vec_zero);
00145
00146 v3 = _mm_unpackhi_epi16(v1,vec_zero);
00147 v1 = _mm_unpacklo_epi16(v1,vec_zero);
00148 v1 = _mm_add_epi32(v1,v3);
00149
00150 v3 = _mm_unpackhi_epi16(v2,vec_zero);
00151 v2 = _mm_unpacklo_epi16(v2,vec_zero);
00152 v2 = _mm_add_epi32(v2,v3);
00153
00154 v1 = _mm_add_epi32(v1,v2);
00155
00156 rvol = _mm_add_epi32(v1,rvol);
00157 }
00158
00159 *((__m128i*) array_vol) = rvol;
00160
00161 for(i=0 ; i<4 ; i++) {
00162 _volume += array_vol[i];
00163 }
00164
00165 for(i=nbvector<<4 ; i<dat->length ; i++) {
00166 _volume += (fgUINT32) psrc[i];
00167 }
00168
00169 *((fgUINT32*) volume->array) = _volume;
00170 #endif
00171 }
00172
00173
00174
00175 void flgr1d_measure_volume_u64_spp1_SSE2_fgUINT8(FLGR_Data1D *dat, FLGR_Vector *volume) {
00176 #ifdef __SSE2__
00177 fgUINT64 array_vol[2] __attribute__ ((aligned (16)));
00178 fgUINT64 _volume;
00179 fgUINT8 *psrc = (fgUINT8*) dat->array;
00180 __m128i *vec_psrc = (__m128i*) dat->array;
00181 __m128i rvol;
00182 __m128i rsrc;
00183 __m128i v1, v2;
00184 __m128i v1_1, v1_2;
00185 __m128i v2_1, v2_2;
00186 __m128i vt1, vt2;
00187 __m128i vec_zero;
00188 int i, nbvector = dat->length>>4;
00189
00190
00191
00192 v1=vec_psrc[0];
00193 vec_zero = _mm_xor_si128( v1, v1 );
00194
00195
00196 rvol = vec_zero;
00197
00198
00199 for(i=0 ; i<nbvector ; i++) {
00200 rsrc = vec_psrc[i];
00201 v1 = _mm_unpackhi_epi8(rsrc,vec_zero);
00202 v2 = _mm_unpacklo_epi8(rsrc,vec_zero);
00203
00204 v1_1 = _mm_unpackhi_epi16(v1,vec_zero);
00205 v1_2 = _mm_unpacklo_epi16(v1,vec_zero);
00206
00207 v2_1 = _mm_unpackhi_epi16(v2,vec_zero);
00208 v2_2 = _mm_unpacklo_epi16(v2,vec_zero);
00209
00210 vt1 = _mm_unpackhi_epi32(v1_1,vec_zero);
00211 vt2 = _mm_unpacklo_epi32(v1_1,vec_zero);
00212 v1_1 = _mm_add_epi64(vt1,vt2);
00213
00214 vt1 = _mm_unpackhi_epi32(v1_2,vec_zero);
00215 vt2 = _mm_unpacklo_epi32(v1_2,vec_zero);
00216 v1_2 = _mm_add_epi64(vt1,vt2);
00217
00218 vt1 = _mm_unpackhi_epi32(v2_1,vec_zero);
00219 vt2 = _mm_unpacklo_epi32(v2_1,vec_zero);
00220 v2_1 = _mm_add_epi64(vt1,vt2);
00221
00222 vt1 = _mm_unpackhi_epi32(v2_2,vec_zero);
00223 vt2 = _mm_unpacklo_epi32(v2_2,vec_zero);
00224 v2_2 = _mm_add_epi64(vt1,vt2);
00225
00226 v1 = _mm_add_epi64(v1_1,v1_2);
00227 v2 = _mm_add_epi64(v2_1,v2_2);
00228
00229 v1 = _mm_add_epi64(v1,v2);
00230
00231 rvol = _mm_add_epi64(v1,rvol);
00232 }
00233
00234 *((__m128i*) array_vol) = rvol;
00235
00236 _volume = array_vol[0]+array_vol[1];
00237
00238 for(i=nbvector<<4 ; i<dat->length ; i++) {
00239 _volume += (fgUINT64) psrc[i];
00240 }
00241
00242 *((fgUINT64*) volume->array) = _volume;
00243 #endif
00244 }
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261 void flgr1d_measure_volume_u32_spp1_SSE2_fgUINT16(FLGR_Data1D *dat, FLGR_Vector *volume) {
00262 #ifdef __SSE2__
00263 fgUINT32 array_vol[4] __attribute__ ((aligned (16)));
00264 fgUINT32 _volume=0;
00265 fgUINT16 *psrc = (fgUINT16*) dat->array;
00266 __m128i *vec_psrc = (__m128i*) dat->array;
00267 __m128i rvol;
00268 __m128i rsrc;
00269 __m128i v1, v2;
00270 __m128i vec_zero;
00271 int i, nbvector = dat->length>>3;
00272
00273
00274
00275 v1=vec_psrc[0];
00276 vec_zero = _mm_xor_si128( v1, v1 );
00277
00278
00279 rvol = vec_zero;
00280
00281
00282 for(i=0 ; i<nbvector ; i++) {
00283 rsrc = vec_psrc[i];
00284 v1 = _mm_unpackhi_epi16(rsrc,vec_zero);
00285 v2 = _mm_unpacklo_epi16(rsrc,vec_zero);
00286
00287 v1 = _mm_add_epi32(v1,v2);
00288
00289 rvol = _mm_add_epi32(v1,rvol);
00290 }
00291
00292 *((__m128i*) array_vol) = rvol;
00293
00294 _volume = array_vol[0];
00295 _volume += array_vol[1];
00296 _volume += array_vol[2];
00297 _volume += array_vol[3];
00298
00299 for(i=nbvector<<3 ; i<dat->length ; i++) {
00300 _volume += (fgUINT32) psrc[i];
00301 }
00302
00303 *((fgUINT32*) volume->array) = _volume;
00304 #endif
00305 }
00306
00307
00308
00309 void flgr1d_measure_volume_u64_spp1_SSE2_fgUINT16(FLGR_Data1D *dat, FLGR_Vector *volume) {
00310 #ifdef __SSE2__
00311 fgUINT64 array_vol[2] __attribute__ ((aligned (16)));
00312 fgUINT64 _volume;
00313 fgUINT16 *psrc = (fgUINT16*) dat->array;
00314 __m128i *vec_psrc = (__m128i*) dat->array;
00315 __m128i rvol;
00316 __m128i v1, v2;
00317 __m128i v3;
00318 __m128i vec_zero;
00319 int i, nbvector = dat->length>>3;
00320
00321
00322
00323 v1=vec_psrc[0];
00324 vec_zero = _mm_xor_si128( v1, v1 );
00325
00326 rvol = vec_zero;
00327
00328 for(i=0 ; i<nbvector ; i++) {
00329 v2 = vec_psrc[i];
00330 v1 = _mm_unpackhi_epi16(v2,vec_zero);
00331 v2 = _mm_unpacklo_epi16(v2,vec_zero);
00332
00333 v3 = _mm_unpackhi_epi32(v1,vec_zero);
00334 v1 = _mm_unpacklo_epi32(v1,vec_zero);
00335 v1 = _mm_add_epi64(v1,v3);
00336
00337 v3 = _mm_unpackhi_epi32(v2,vec_zero);
00338 v2 = _mm_unpacklo_epi32(v2,vec_zero);
00339 v2 = _mm_add_epi64(v2,v3);
00340
00341 v1 = _mm_add_epi64(v1,v2);
00342
00343 rvol = _mm_add_epi64(v1,rvol);
00344 }
00345
00346 *((__m128i*) array_vol) = rvol;
00347
00348 _volume = array_vol[0]+array_vol[1];
00349
00350 for(i=nbvector<<3 ; i<dat->length ; i++) {
00351 _volume += (fgUINT64) psrc[i];
00352 }
00353
00354 *((fgUINT64*) volume->array) = _volume;
00355 #endif
00356 }
00357
00358
00359
00360
00361
00362
00363
00364
00365 void flgr1d_measure_volume_u32_spp1_SSE2_fgINT16(FLGR_Data1D *dat, FLGR_Vector *volume) {
00366 #ifdef __SSE2__
00367 fgUINT32 array_vol[4] __attribute__ ((aligned (16)));
00368 fgUINT32 _volume=0;
00369 fgINT16 *psrc = (fgINT16*) dat->array;
00370 __m128i *vec_psrc = (__m128i*) dat->array;
00371 __m128i rvol;
00372 __m128i rsrc;
00373 __m128i v1, v2;
00374 __m128i vec_zero;
00375 int i, nbvector = dat->length>>3;
00376
00377
00378
00379 v1=vec_psrc[0];
00380 vec_zero = _mm_xor_si128( v1, v1 );
00381
00382
00383 rvol = vec_zero;
00384
00385
00386 for(i=0 ; i<nbvector ; i++) {
00387 rsrc = vec_psrc[i];
00388 v1 = _mm_unpackhi_epi16(rsrc,vec_zero);
00389 v2 = _mm_unpacklo_epi16(rsrc,vec_zero);
00390
00391 v1 = _mm_add_epi32(v1,v2);
00392
00393 rvol = _mm_add_epi32(v1,rvol);
00394 }
00395
00396 *((__m128i*) array_vol) = rvol;
00397
00398 _volume = array_vol[0];
00399 _volume += array_vol[1];
00400 _volume += array_vol[2];
00401 _volume += array_vol[3];
00402
00403 for(i=nbvector<<3 ; i<dat->length ; i++) {
00404 _volume += (fgUINT32) psrc[i];
00405 }
00406
00407 *((fgUINT32*) volume->array) = _volume;
00408 #endif
00409 }
00410
00411
00412
00413
00414
00415
00416 void flgr1d_measure_volume_s32_spp1_SSE2_fgINT16(FLGR_Data1D *dat, FLGR_Vector *volume) {
00417 #ifdef __SSE2__
00418 fgINT32 array_vol[4] __attribute__ ((aligned (16)));
00419 fgINT32 _volume=0;
00420 fgINT16 *psrc = (fgINT16*) dat->array;
00421 __m128i *vec_psrc = (__m128i*) dat->array;
00422 __m128i rvol;
00423 __m128i v1, v2;
00424 int i, nbvector = dat->length>>3;
00425
00426
00427
00428
00429 v2 = vec_psrc[0];
00430 rvol = _mm_xor_si128( v2, v2 );
00431
00432
00433 for(i=0 ; i<nbvector ; i++) {
00434 v2 = vec_psrc[i];
00435
00436 v1 = _mm_unpackhi_epi16(v2,v2);
00437 v1 = _mm_srai_epi32( v1, 16 );
00438
00439 v2 = _mm_unpacklo_epi16(v2,v2);
00440 v2 = _mm_srai_epi32( v2, 16 );
00441
00442 v1 = _mm_add_epi32(v1,v2);
00443
00444 rvol = _mm_add_epi32(v1,rvol);
00445 }
00446
00447 *((__m128i*) array_vol) = rvol;
00448
00449 _volume = array_vol[0];
00450 _volume += array_vol[1];
00451 _volume += array_vol[2];
00452 _volume += array_vol[3];
00453
00454 for(i=nbvector<<3 ; i<dat->length ; i++) {
00455 _volume += (fgINT32) psrc[i];
00456 }
00457
00458 *((fgINT32*) volume->array) = _volume;
00459 #endif
00460 }
00461
00462 void flgr1d_measure_volume_u64_spp1_SSE2_fgINT16(FLGR_Data1D *dat, FLGR_Vector *volume) {
00463 #ifdef __SSE2__
00464 fgUINT64 array_vol[2] __attribute__ ((aligned (16)));
00465 fgUINT64 _volume;
00466 fgINT16 *psrc = (fgINT16*) dat->array;
00467 __m128i *vec_psrc = (__m128i*) dat->array;
00468 __m128i rvol;
00469 __m128i v1, v2;
00470 __m128i v3;
00471 __m128i vec_zero;
00472 int i, nbvector = dat->length>>3;
00473
00474
00475
00476 v1=vec_psrc[0];
00477 vec_zero = _mm_xor_si128( v1, v1 );
00478
00479 rvol = vec_zero;
00480
00481 for(i=0 ; i<nbvector ; i++) {
00482 v2 = vec_psrc[i];
00483 v1 = _mm_unpackhi_epi16(v2,vec_zero);
00484 v2 = _mm_unpacklo_epi16(v2,vec_zero);
00485
00486 v3 = _mm_unpackhi_epi32(v1,vec_zero);
00487 v1 = _mm_unpacklo_epi32(v1,vec_zero);
00488 v1 = _mm_add_epi64(v1,v3);
00489
00490 v3 = _mm_unpackhi_epi32(v2,vec_zero);
00491 v2 = _mm_unpacklo_epi32(v2,vec_zero);
00492 v2 = _mm_add_epi64(v2,v3);
00493
00494 v1 = _mm_add_epi64(v1,v2);
00495
00496 rvol = _mm_add_epi64(v1,rvol);
00497 }
00498
00499 *((__m128i*) array_vol) = rvol;
00500
00501 _volume = array_vol[0]+array_vol[1];
00502
00503 for(i=nbvector<<3 ; i<dat->length ; i++) {
00504 _volume += (fgUINT64) psrc[i];
00505 }
00506
00507 *((fgUINT64*) volume->array) = _volume;
00508 #endif
00509 }
00510
00511
00512
00513
00514 void flgr1d_measure_sad_u32_spp1_SSE2_fgUINT8(FLGR_Data1D *dat1, FLGR_Data1D *dat2, FLGR_Vector *sad) {
00515 #ifdef __SSE2__
00516 fgUINT32 array_vol[4] __attribute__ ((aligned (16)));
00517 fgUINT32 _sad=0;
00518 fgUINT8 *psrc1 = (fgUINT8*) dat1->array;
00519 fgUINT8 *psrc2 = (fgUINT8*) dat2->array;
00520 fgUINT8 val1,val2;
00521 __m128i *vec_psrc1 = (__m128i*) dat1->array;
00522 __m128i *vec_psrc2 = (__m128i*) dat2->array;
00523 __m128i rsad,v1;
00524 int i, nbvector = dat1->length>>4;
00525
00526
00527
00528 v1 = vec_psrc1[0];
00529 rsad = _mm_xor_si128( v1, v1 );
00530
00531 for(i=0 ; i<nbvector ; i++) {
00532 v1 = _mm_sad_epu8(vec_psrc1[i], vec_psrc2[i]);
00533 rsad = _mm_add_epi32(v1,rsad);
00534 }
00535
00536 *((__m128i*) array_vol) = rsad;
00537
00538 for(i=0 ; i<4 ; i+=2) {
00539 _sad += array_vol[i];
00540 }
00541
00542 for(i=nbvector<<4 ; i<dat1->length ; i++) {
00543 val1 = psrc1[i];
00544 val2 = psrc2[i];
00545 _sad += (fgUINT32) ((val1>val2) ? (val1-val2) : (val2-val1));
00546 }
00547
00548 *((fgUINT32*) sad->array) = _sad;
00549 #endif
00550 }
00551
00552