#include <string.h>
#include <flgrCoreVector.h>
#include <flgrCoreDispatch.h>
#include <flgrCoreArithDef.h>
#include <flgrCoreIO.h>
#include "flgrArithShiftFast.h"

#ifdef __SSE3__
#include <pmmintrin.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

#if defined(__SSE__)
#include <xmmintrin.h>
#endif

#if defined(__MMX__)
#include <mmintrin.h>
#endif

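/* Unaligned 128-bit integer load used by the SSE2 shift macros below:
 * prefer _mm_lddqu_si128 (SSE3, designed for unaligned accesses), otherwise
 * fall back to _mm_loadu_si128.  If only MMX/SSE is available the macro is
 * left undefined, which is harmless since it is only referenced from the
 * SSE2 code paths. */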
#ifdef __SSE3__
#define vector128_load_unalign(x) _mm_lddqu_si128(x)
#else
#ifdef __SSE2__
#define vector128_load_unalign(x) _mm_loadu_si128(x)
#endif
#endif

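/* Scalar helpers for the bit-packed (fgBIT) shift macros: one tbit word
 * holds sizeof(tbit)*8 pixels, one pixel per bit. */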
#define tbit fgBIT
#define set(x) ((fgBIT) (x))
#define shiftr(a,x) ((a)>>(x))
#define shiftl(a,x) ((a)<<(x))
#define and(a,b) ((a) & (b))
#define or(a,b) ((a) | (b))
#define empty()

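/* Combine a bit-packed line with a copy of itself shifted by one bit
 * position, using 'op' (instantiated below with or/and, i.e. the sup/inf of
 * each pixel and its neighbour).  'border' is the bit injected at the first
 * word.  Each word is shifted right by one bit and the bit that drops out of
 * it (its LSB, isolated with rmask) is carried into the MSB of the next
 * word.  The loop runs with i<=nbvec so that a trailing partial word is
 * processed too. */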
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_BIT(op,border)              \
  tbit *pdest = (tbit*) datdest->array;                                \
  tbit *psrc = (tbit*) data1->array;                                   \
  tbit r0;                                                             \
  tbit r1;                                                             \
  tbit r2;                                                             \
  tbit rmask;                                                          \
  int i;                                                               \
  int nbvec = datdest->length / (sizeof(tbit)*8);                      \
                                                                       \
  r2 = set( ((fgBIT) border)<<(sizeof(tbit)*8-1) );                    \
  rmask = set(1);                                                      \
                                                                       \
  for(i=0 ; i<=nbvec ; i++, psrc++, pdest++) {                         \
    r0 = *psrc;                                                        \
    r1 = shiftr(r0,1);                                                 \
    r1 = or(r2,r1);                                                    \
    r1 = op(r1,r0);                                                    \
    r2 = and(r0,rmask);                                                \
    r2 = shiftl(r2,(sizeof(tbit)*8-1));                                \
    *pdest = r1;                                                       \
  }                                                                    \
  empty()

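/* Mirror of the macro above: the line is walked from the last word back to
 * the first, each word is shifted left by one bit, and the MSB that drops
 * out (isolated with rmask) is carried into the LSB of the preceding word.
 * 'posvec' positions the border bit according to the number of valid bits in
 * the final, possibly partial, word. */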
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_BIT(op,border)               \
  tbit *pdest = (tbit*) datdest->array;                                \
  tbit *psrc = (tbit*) data1->array;                                   \
  tbit r0;                                                             \
  tbit r1;                                                             \
  tbit r2;                                                             \
  tbit rmask;                                                          \
  int i;                                                               \
  int nbvec = datdest->length / (sizeof(tbit)*8);                      \
  int posvec = (sizeof(tbit)*8) -                                      \
               (datdest->length % (sizeof(tbit)*8));                   \
                                                                       \
  rmask = set(((fgBIT) 1)<<(sizeof(tbit)*8-1));                        \
                                                                       \
  r2 = set( ((fgBIT) border)<<posvec );                                \
                                                                       \
  psrc += nbvec;                                                       \
  pdest += nbvec;                                                      \
                                                                       \
  for(i=0 ; i<=nbvec ; i++, psrc--, pdest--) {                         \
    r0 = *psrc;                                                        \
    r1 = shiftl(r0,1);                                                 \
    r1 = or(r2,r1);                                                    \
    r1 = op(r1,r0);                                                    \
    r2 = and(r0,rmask);                                                \
    r2 = shiftr(r2,(sizeof(tbit)*8-1));                                \
    *pdest = r1;                                                       \
  }                                                                    \
  empty()

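/* 64-bit MMX/SSE path for a right shift expressed in pixels: the first
 * sfsize = spp*right_shift_size samples are plain copies (left border), then
 * dest[j] = vector_instr(src[j], src[j-sfsize]) is computed __m64-wide.
 * scalar_instr is unused here; it is kept so that every shift macro takes
 * the same parameter list.  _mm_empty() restores the x87 state after MMX
 * use. */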
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE(dtype,vtype,            \
                                               scalar_instr,           \
                                               vector_instr)           \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*right_shift_size;                             \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc1 = (dtype*) data1->array;                                \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype result;                                                        \
  vtype *vec_src1;                                                     \
  vtype *vec_src2;                                                     \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  for(j=0 ; j<sfsize ; j++) {                                          \
    result = flgr_get_array_##dtype(psrc1,j);                          \
    flgr_set_array_##dtype(pdest,j,result);                            \
  }                                                                    \
                                                                       \
  vec_src1 = (vtype*) (psrc1+sfsize);                                  \
  vec_src2 = (vtype*) psrc1;                                           \
  vec_dest = (vtype*) (pdest+sfsize);                                  \
                                                                       \
  for(j=sfsize ; j<length ; j+=vec_length) {                           \
    vec1 = *vec_src1;                                                  \
    vec2 = *vec_src2;                                                  \
    vec1 = vector_instr(vec1,vec2);                                    \
    *vec_dest = vec1;                                                  \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  _mm_empty();                                                         \
  return

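/* Left-shift counterpart of the macro above: the bulk computes
 * dest[j] = vector_instr(src[j], src[j+sfsize]), and the last sfsize samples
 * are plain copies (right border). */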
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE(dtype,vtype,             \
                                              scalar_instr,            \
                                              vector_instr)            \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*left_shift_size;                              \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc1 = (dtype*) data1->array;                                \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype result;                                                        \
  vtype *vec_src1;                                                     \
  vtype *vec_src2;                                                     \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  vec_src1 = (vtype*) psrc1;                                           \
  vec_src2 = (vtype*) (psrc1+sfsize);                                  \
  vec_dest = (vtype*) pdest;                                           \
                                                                       \
  for(j=0 ; j<length-sfsize ; j+=vec_length) {                         \
    vec1 = *vec_src1;                                                  \
    vec2 = *vec_src2;                                                  \
    vec1 = vector_instr(vec1,vec2);                                    \
    *vec_dest = vec1;                                                  \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  for(j=length-sfsize ; j<length ; j++) {                              \
    result = flgr_get_array_##dtype(psrc1,j);                          \
    flgr_set_array_##dtype(pdest,j,result);                            \
  }                                                                    \
                                                                       \
  _mm_empty();                                                         \
  return

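/* Same scheme as FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE, specialised for
 * fgFLOAT32 with __m128 registers and explicit unaligned loads/stores
 * (_mm_loadu_ps / _mm_storeu_ps), since psrc1+sfsize and pdest+sfsize are in
 * general not 16-byte aligned. */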
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE_FLOAT32(scalar_instr,   \
                                                       vector_instr)   \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*right_shift_size;                             \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(__m128)/sizeof(fgFLOAT32);             \
  fgFLOAT32 *psrc1 = (fgFLOAT32*) data1->array;                        \
  fgFLOAT32 *pdest = (fgFLOAT32*) datdest->array;                      \
  fgFLOAT32 result;                                                    \
  __m128 *vec_src1;                                                    \
  __m128 *vec_src2;                                                    \
  __m128 *vec_dest;                                                    \
  __m128 vec1;                                                         \
  __m128 vec2;                                                         \
  int j;                                                               \
                                                                       \
  for(j=0 ; j<sfsize ; j++) {                                          \
    result = flgr_get_array_fgFLOAT32(psrc1,j);                        \
    flgr_set_array_fgFLOAT32(pdest,j,result);                          \
  }                                                                    \
                                                                       \
  vec_src1 = (__m128*) (psrc1+sfsize);                                 \
  vec_src2 = (__m128*) psrc1;                                          \
  vec_dest = (__m128*) (pdest+sfsize);                                 \
                                                                       \
  for(j=sfsize ; j<length ; j+=vec_length) {                           \
    vec1 = _mm_loadu_ps((float*) vec_src1);                            \
    vec2 = _mm_loadu_ps((float*) vec_src2);                            \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_storeu_ps((float*) vec_dest,vec1);                             \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  return

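/* fgFLOAT32 left-shift counterpart: vector bulk first, then the trailing
 * border samples are copied unchanged. */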
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE_FLOAT32(scalar_instr,    \
                                                      vector_instr)    \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*left_shift_size;                              \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(__m128)/sizeof(fgFLOAT32);             \
  fgFLOAT32 *psrc1 = (fgFLOAT32*) data1->array;                        \
  fgFLOAT32 *pdest = (fgFLOAT32*) datdest->array;                      \
  fgFLOAT32 result;                                                    \
  __m128 *vec_src1;                                                    \
  __m128 *vec_src2;                                                    \
  __m128 *vec_dest;                                                    \
  __m128 vec1;                                                         \
  __m128 vec2;                                                         \
  int j;                                                               \
                                                                       \
  vec_src1 = (__m128*) psrc1;                                          \
  vec_src2 = (__m128*) (psrc1+sfsize);                                 \
  vec_dest = (__m128*) pdest;                                          \
                                                                       \
  for(j=0 ; j<length-sfsize ; j+=vec_length) {                         \
    vec1 = _mm_loadu_ps((float*) vec_src1);                            \
    vec2 = _mm_loadu_ps((float*) vec_src2);                            \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_storeu_ps((float*) vec_dest,vec1);                             \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  for(j=length-sfsize ; j<length ; j++) {                              \
    result = flgr_get_array_fgFLOAT32(psrc1,j);                        \
    flgr_set_array_fgFLOAT32(pdest,j,result);                          \
  }                                                                    \
                                                                       \
  return

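/* SSE2 integer path (__m128i): the stream starting at the array base is read
 * with the aligned _mm_load_si128, the stream offset by sfsize samples with
 * vector128_load_unalign, and the result is written with an unaligned store
 * since pdest+sfsize is in general not 16-byte aligned.  This relies on the
 * FLGR_Data1D buffers themselves being 16-byte aligned. */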
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2(dtype,vtype,           \
                                                scalar_instr,          \
                                                vector_instr)          \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*right_shift_size;                             \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc1 = (dtype*) data1->array;                                \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype result;                                                        \
  vtype *vec_src1;                                                     \
  vtype *vec_src2;                                                     \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  for(j=0 ; j<sfsize ; j++) {                                          \
    result = flgr_get_array_##dtype(psrc1,j);                          \
    flgr_set_array_##dtype(pdest,j,result);                            \
  }                                                                    \
                                                                       \
  vec_src1 = (vtype*) (psrc1+sfsize);                                  \
  vec_src2 = (vtype*) psrc1;                                           \
  vec_dest = (vtype*) (pdest+sfsize);                                  \
                                                                       \
  for(j=sfsize ; j<length ; j+=vec_length) {                           \
    vec1 = vector128_load_unalign(vec_src1);                           \
    vec2 = _mm_load_si128(vec_src2);                                   \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_storeu_si128(vec_dest,vec1);                                   \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  return

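/* SSE2 integer left-shift counterpart: aligned load/store on the base
 * streams, unaligned load on the stream offset by sfsize samples, then the
 * trailing border samples are copied unchanged. */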
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2(dtype,vtype,            \
                                               scalar_instr,           \
                                               vector_instr)           \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*left_shift_size;                              \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc1 = (dtype*) data1->array;                                \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype result;                                                        \
  vtype *vec_src1;                                                     \
  vtype *vec_src2;                                                     \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  vec_src1 = (vtype*) psrc1;                                           \
  vec_src2 = (vtype*) (psrc1+sfsize);                                  \
  vec_dest = (vtype*) pdest;                                           \
                                                                       \
  for(j=0 ; j<length-sfsize ; j+=vec_length) {                         \
    vec1 = _mm_load_si128(vec_src1);                                   \
    vec2 = vector128_load_unalign(vec_src2);                           \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_store_si128(vec_dest,vec1);                                    \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  for(j=length-sfsize ; j<length ; j++) {                              \
    result = flgr_get_array_##dtype(psrc1,j);                          \
    flgr_set_array_##dtype(pdest,j,result);                            \
  }                                                                    \
                                                                       \
  return

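/* Specialisation for a one-sample right shift on a single-channel 8-bit
 * line (only valid for one-byte samples; it is instantiated below for
 * fgUINT8 only).  Within each 16-byte vector the "previous sample" is
 * obtained with _mm_slli_si128(vec,1), which moves every byte one position
 * up in memory order and shifts a zero into the first lane; that lane is
 * then patched with the scalar instruction using the last sample of the
 * previous vector, and sample 0 of the line is simply copied (border).
 * Per 16-byte block starting at j this amounts to (sketch, array-slice
 * notation):
 *
 *   pdest[j..j+15] = vector_instr(psrc[j..j+15], psrc[j-1..j+14]);
 *   pdest[j]       = scalar_instr(psrc[j-1], psrc[j]);    (fix lane 0)
 *
 * The aligned _mm_load_si128/_mm_store_si128 are used, so this path also
 * relies on 16-byte aligned buffers. */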
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_1_RIGHT_SSE2(dtype,vtype,         \
                                                  scalar_instr,        \
                                                  vector_instr)        \
  const int spp = datdest->spp;                                        \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc = (dtype*) data1->array;                                 \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype sca1;                                                          \
  dtype sca2;                                                          \
  vtype *vec_src;                                                      \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  vec_src = (vtype*) (psrc);                                           \
  vec_dest = (vtype*) (pdest);                                         \
                                                                       \
  vec1 = _mm_load_si128(vec_src);                                      \
  vec2 = _mm_slli_si128(vec1,1);                                       \
  vec1 = vector_instr(vec1,vec2);                                      \
  _mm_store_si128(vec_dest,vec1);                                      \
                                                                       \
  sca1 = flgr_get_array_##dtype(psrc,0);                               \
  flgr_set_array_##dtype(pdest,0,sca1);                                \
                                                                       \
  vec_src++; vec_dest++;                                               \
                                                                       \
  for(j=vec_length ; j<length ; j+=vec_length, vec_src++, vec_dest++) { \
    vec1 = _mm_load_si128(vec_src);                                    \
    vec2 = _mm_slli_si128(vec1,1);                                     \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_store_si128(vec_dest,vec1);                                    \
                                                                       \
    sca1 = flgr_get_array_##dtype(psrc,j-1);                           \
    sca2 = flgr_get_array_##dtype(psrc,j);                             \
    sca1 = scalar_instr(sca1,sca2);                                    \
    flgr_set_array_##dtype(pdest,j,sca1);                              \
  }                                                                    \
                                                                       \
  return

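/* One-sample left-shift counterpart: _mm_srli_si128(vec,1) brings the "next
 * sample" one position down and shifts a zero into the last lane, which is
 * patched with the scalar instruction using the first sample of the next
 * vector.  The final vector is handled after the loop and the last sample of
 * the line is copied unchanged (border).  As above, this is only valid for
 * one-byte samples. */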
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_1_LEFT_SSE2(dtype,vtype,          \
                                                 scalar_instr,         \
                                                 vector_instr)         \
  const int spp = datdest->spp;                                        \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(vtype)/sizeof(dtype);                  \
  dtype *psrc = (dtype*) data1->array;                                 \
  dtype *pdest = (dtype*) datdest->array;                              \
  dtype sca1;                                                          \
  dtype sca2;                                                          \
  vtype *vec_src;                                                      \
  vtype *vec_dest;                                                     \
  vtype vec1;                                                          \
  vtype vec2;                                                          \
  int j;                                                               \
                                                                       \
  vec_src = (vtype*) (psrc);                                           \
  vec_dest = (vtype*) (pdest);                                         \
                                                                       \
  for(j=0 ; j<length-vec_length ; j+=vec_length, vec_src++, vec_dest++) { \
    vec1 = _mm_load_si128(vec_src);                                    \
    vec2 = _mm_srli_si128(vec1,1);                                     \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_store_si128(vec_dest,vec1);                                    \
                                                                       \
    sca1 = flgr_get_array_##dtype(psrc,j+vec_length-1);                \
    sca2 = flgr_get_array_##dtype(psrc,j+vec_length);                  \
    sca1 = scalar_instr(sca1,sca2);                                    \
    flgr_set_array_##dtype(pdest,j+vec_length-1,sca1);                 \
  }                                                                    \
                                                                       \
  j = length-vec_length;                                               \
                                                                       \
  vec1 = _mm_load_si128(vec_src);                                      \
  vec2 = _mm_srli_si128(vec1,1);                                       \
  vec1 = vector_instr(vec1,vec2);                                      \
  _mm_store_si128(vec_dest,vec1);                                      \
  sca1 = flgr_get_array_##dtype(psrc,j+vec_length-1);                  \
  flgr_set_array_##dtype(pdest,j+vec_length-1,sca1);                   \
                                                                       \
  return

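/* fgFLOAT64 variant of the SSE2 right-shift scheme: __m128d vectors (two
 * doubles) with unaligned loads/stores throughout. */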
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2_FLOAT64(scalar_instr,  \
                                                        vector_instr)  \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*right_shift_size;                             \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(__m128d)/sizeof(fgFLOAT64);            \
  fgFLOAT64 *psrc1 = (fgFLOAT64*) data1->array;                        \
  fgFLOAT64 *pdest = (fgFLOAT64*) datdest->array;                      \
  fgFLOAT64 result;                                                    \
  __m128d *vec_src1;                                                   \
  __m128d *vec_src2;                                                   \
  __m128d *vec_dest;                                                   \
  __m128d vec1;                                                        \
  __m128d vec2;                                                        \
  int j;                                                               \
                                                                       \
  for(j=0 ; j<sfsize ; j++) {                                          \
    result = flgr_get_array_fgFLOAT64(psrc1,j);                        \
    flgr_set_array_fgFLOAT64(pdest,j,result);                          \
  }                                                                    \
                                                                       \
  vec_src1 = (__m128d*) (psrc1+sfsize);                                \
  vec_src2 = (__m128d*) psrc1;                                         \
  vec_dest = (__m128d*) (pdest+sfsize);                                \
                                                                       \
  for(j=sfsize ; j<length ; j+=vec_length) {                           \
    vec1 = _mm_loadu_pd((double*) vec_src1);                           \
    vec2 = _mm_loadu_pd((double*) vec_src2);                           \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_storeu_pd((double*) vec_dest,vec1);                            \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  return

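/* fgFLOAT64 left-shift counterpart. */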
#define FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2_FLOAT64(scalar_instr,   \
                                                       vector_instr)   \
  const int spp = datdest->spp;                                        \
  const int sfsize = spp*left_shift_size;                              \
  const int length = datdest->length*spp;                              \
  const int vec_length = sizeof(__m128d)/sizeof(fgFLOAT64);            \
  fgFLOAT64 *psrc1 = (fgFLOAT64*) data1->array;                        \
  fgFLOAT64 *pdest = (fgFLOAT64*) datdest->array;                      \
  fgFLOAT64 result;                                                    \
  __m128d *vec_src1;                                                   \
  __m128d *vec_src2;                                                   \
  __m128d *vec_dest;                                                   \
  __m128d vec1;                                                        \
  __m128d vec2;                                                        \
  int j;                                                               \
                                                                       \
  vec_src1 = (__m128d*) psrc1;                                         \
  vec_src2 = (__m128d*) (psrc1+sfsize);                                \
  vec_dest = (__m128d*) pdest;                                         \
                                                                       \
  for(j=0 ; j<length-sfsize ; j+=vec_length) {                         \
    vec1 = _mm_loadu_pd((double*) vec_src1);                           \
    vec2 = _mm_loadu_pd((double*) vec_src2);                           \
    vec1 = vector_instr(vec1,vec2);                                    \
    _mm_storeu_pd((double*) vec_dest,vec1);                            \
    vec_dest++; vec_src2++; vec_src1++;                                \
  }                                                                    \
                                                                       \
  for(j=length-sfsize ; j<length ; j++) {                              \
    result = flgr_get_array_fgFLOAT64(psrc1,j);                        \
    flgr_set_array_fgFLOAT64(pdest,j,result);                          \
  }                                                                    \
                                                                       \
  return

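/* Entry points: sup (maximum) of a line with a copy of itself shifted to the
 * right.  The fgBIT variant uses the bit-packed macro for a one-pixel shift
 * with a 0 border (neutral element of sup).  The other types pick the widest
 * instruction set enabled at compile time: SSE2, otherwise MMX+SSE,
 * otherwise POST_ERROR is raised.  For fgUINT8 with a one-sample shift and
 * spp==1 the dedicated one-sample specialisation is used. */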
void flgr1d_arith_sup_shift_1_right_fast_fgBIT(FLGR_Data1D *datdest, FLGR_Data1D *data1) {
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_BIT(or,0);
}

void flgr1d_arith_sup_shift_right_fast_fgUINT8(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  if((right_shift_size==1) && (datdest->spp==1)) {
    FLGR_MACRO_1D_ARITH_OP_SHIFT_1_RIGHT_SSE2(fgUINT8, __m128i, flgr_defop_sup_fgUINT8, _mm_max_epu8);
  } else {
    FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2(fgUINT8, __m128i, flgr_defop_sup_fgUINT8, _mm_max_epu8);
  }
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE(fgUINT8, __m64, flgr_defop_sup_fgUINT8, _mm_max_pu8);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_sup_shift_right_fast_fgINT16(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2(fgINT16, __m128i, flgr_defop_sup_fgINT16, _mm_max_epi16);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE(fgINT16, __m64, flgr_defop_sup_fgINT16, _mm_max_pi16);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_sup_shift_right_fast_fgFLOAT32(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE_FLOAT32(flgr_defop_sup_fgFLOAT32, _mm_max_ps);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

void flgr1d_arith_sup_shift_right_fast_fgFLOAT64(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2_FLOAT64(flgr_defop_sup_fgFLOAT64, _mm_max_pd);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

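/* sup (maximum) of a line with a copy of itself shifted to the left; same
 * dispatch scheme as the right-shift entry points above. */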
void flgr1d_arith_sup_shift_1_left_fast_fgBIT(FLGR_Data1D *datdest, FLGR_Data1D *data1) {
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_BIT(or,0);
}

void flgr1d_arith_sup_shift_left_fast_fgUINT8(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2(fgUINT8, __m128i, flgr_defop_sup_fgUINT8, _mm_max_epu8);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE(fgUINT8, __m64, flgr_defop_sup_fgUINT8, _mm_max_pu8);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_sup_shift_left_fast_fgINT16(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2(fgINT16, __m128i, flgr_defop_sup_fgINT16, _mm_max_epi16);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE(fgINT16, __m64, flgr_defop_sup_fgINT16, _mm_max_pi16);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_sup_shift_left_fast_fgFLOAT32(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE_FLOAT32(flgr_defop_sup_fgFLOAT32, _mm_max_ps);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

void flgr1d_arith_sup_shift_left_fast_fgFLOAT64(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2_FLOAT64(flgr_defop_sup_fgFLOAT64, _mm_max_pd);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

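/* inf (minimum) variants: same structure as the sup entry points, using the
 * min instructions (_mm_min_*), the flgr_defop_inf_* operators, and a border
 * bit of 1 (neutral element of inf) for the fgBIT case. */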
void flgr1d_arith_inf_shift_1_right_fast_fgBIT(FLGR_Data1D *datdest, FLGR_Data1D *data1) {
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_BIT(and,1);
}

void flgr1d_arith_inf_shift_right_fast_fgUINT8(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  if((right_shift_size==1) && (datdest->spp==1)) {
    FLGR_MACRO_1D_ARITH_OP_SHIFT_1_RIGHT_SSE2(fgUINT8, __m128i, flgr_defop_inf_fgUINT8, _mm_min_epu8);
  } else {
    FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2(fgUINT8, __m128i, flgr_defop_inf_fgUINT8, _mm_min_epu8);
  }
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE(fgUINT8, __m64, flgr_defop_inf_fgUINT8, _mm_min_pu8);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_inf_shift_right_fast_fgINT16(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2(fgINT16, __m128i, flgr_defop_inf_fgINT16, _mm_min_epi16);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE(fgINT16, __m64, flgr_defop_inf_fgINT16, _mm_min_pi16);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_inf_shift_right_fast_fgFLOAT32(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE_FLOAT32(flgr_defop_inf_fgFLOAT32, _mm_min_ps);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

void flgr1d_arith_inf_shift_right_fast_fgFLOAT64(FLGR_Data1D *datdest, FLGR_Data1D *data1, int right_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_RIGHT_SSE2_FLOAT64(flgr_defop_inf_fgFLOAT64, _mm_min_pd);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

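/* inf (minimum) of a line with a copy of itself shifted to the left. */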
void flgr1d_arith_inf_shift_1_left_fast_fgBIT(FLGR_Data1D *datdest, FLGR_Data1D *data1) {
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_BIT(and,1);
}

void flgr1d_arith_inf_shift_left_fast_fgUINT8(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2(fgUINT8, __m128i, flgr_defop_inf_fgUINT8, _mm_min_epu8);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE(fgUINT8, __m64, flgr_defop_inf_fgUINT8, _mm_min_pu8);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_inf_shift_left_fast_fgINT16(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2(fgINT16, __m128i, flgr_defop_inf_fgINT16, _mm_min_epi16);
#else
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE(fgINT16, __m64, flgr_defop_inf_fgINT16, _mm_min_pi16);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
#endif
}

void flgr1d_arith_inf_shift_left_fast_fgFLOAT32(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#if defined(__MMX__) && defined(__SSE__)
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE_FLOAT32(flgr_defop_inf_fgFLOAT32, _mm_min_ps);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}

void flgr1d_arith_inf_shift_left_fast_fgFLOAT64(FLGR_Data1D *datdest, FLGR_Data1D *data1, int left_shift_size) {
#ifdef __SSE2__
  FLGR_MACRO_1D_ARITH_OP_SHIFT_LEFT_SSE2_FLOAT64(flgr_defop_inf_fgFLOAT64, _mm_min_pd);
#else
  POST_ERROR("SSE or SSE2 not activated\n");
#endif
}