00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <math.h>
00026
00027 #ifdef __SSE2__
00028 #include <emmintrin.h>
00029 #include <xmmintrin.h>
00030 #endif
00031
00032 #ifdef __MMX__
00033 #include <mmintrin.h>
00034 #endif
00035
00036 #include "flgrCoreDispatch.h"
00037 #include "flgrCoreTransposeFast.h"
00038 #include "flgrCoreTransposeFastUtil.h"
00039
/*
 * Edge length, in elements, of the square tile that the fast transpose
 * loops in this file advance by, for byte, word and dword images.
 *
 * With SSE2 one tile row fills a 128-bit register (16, 8 or 4 elements);
 * without it the values correspond to 64 bits per tile row.
 * Only the byte kernel in this file has a non-SSE2 (MMX) code path.
 */
#if defined(__SSE2__)
#define BYTE_BLOCK_SIZE  16
#define WORD_BLOCK_SIZE  8
#define DWORD_BLOCK_SIZE 4
#else
#define BYTE_BLOCK_SIZE  8
#define WORD_BLOCK_SIZE  4
#define DWORD_BLOCK_SIZE 2
#endif
00049
00050
00051 void flgr_transp_block_byte(FLGR_Data2D *dest, FLGR_Data2D *src, int fromRow, int fromCol, int toRow, int toCol) {
00052 #ifdef __SSE2__
00053 __m128i r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,tmp1;
00054
00055 r1 = *((__m128i*) ( src->array[fromRow]+fromCol ));
00056 r2 = *((__m128i*) ( src->array[fromRow+1]+fromCol ));
00057 r3 = *((__m128i*) ( src->array[fromRow+2]+fromCol ));
00058 r4 = *((__m128i*) ( src->array[fromRow+3]+fromCol ));
00059 r5 = *((__m128i*) ( src->array[fromRow+4]+fromCol ));
00060 r6 = *((__m128i*) ( src->array[fromRow+5]+fromCol ));
00061 r7 = *((__m128i*) ( src->array[fromRow+6]+fromCol ));
00062 r8 = *((__m128i*) ( src->array[fromRow+7]+fromCol ));
00063 r9 = *((__m128i*) ( src->array[fromRow+8]+fromCol ));
00064 r10 = *((__m128i*) ( src->array[fromRow+9]+fromCol ));
00065 r11 = *((__m128i*) ( src->array[fromRow+10]+fromCol ));
00066 r12 = *((__m128i*) ( src->array[fromRow+11]+fromCol ));
00067 r13 = *((__m128i*) ( src->array[fromRow+12]+fromCol ));
00068 r14 = *((__m128i*) ( src->array[fromRow+13]+fromCol ));
00069 r15 = *((__m128i*) ( src->array[fromRow+14]+fromCol ));
00070 r16 = *((__m128i*) ( src->array[fromRow+15]+fromCol ));
00071
00072 FLGR_MACRO_TRANSPOSE_BLOCK_16x16_BYTE(r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,tmp1);
00073
00074 *((__m128i*) ( dest->array[toRow+0]+toCol )) = r16;
00075 *((__m128i*) ( dest->array[toRow+1]+toCol )) = r8;
00076 *((__m128i*) ( dest->array[toRow+2]+toCol )) = r12;
00077 *((__m128i*) ( dest->array[toRow+3]+toCol )) = r4;
00078 *((__m128i*) ( dest->array[toRow+4]+toCol )) = r14;
00079 *((__m128i*) ( dest->array[toRow+5]+toCol )) = r6;
00080 *((__m128i*) ( dest->array[toRow+6]+toCol )) = r10;
00081 *((__m128i*) ( dest->array[toRow+7]+toCol )) = r2;
00082 *((__m128i*) ( dest->array[toRow+8]+toCol )) = r15;
00083 *((__m128i*) ( dest->array[toRow+9 ]+toCol )) = r7;
00084 *((__m128i*) ( dest->array[toRow+10]+toCol )) = r11;
00085 *((__m128i*) ( dest->array[toRow+11]+toCol )) = r3;
00086 *((__m128i*) ( dest->array[toRow+12]+toCol )) = r13;
00087 *((__m128i*) ( dest->array[toRow+13]+toCol )) = r5;
00088 *((__m128i*) ( dest->array[toRow+14]+toCol )) = r9;
00089 *((__m128i*) ( dest->array[toRow+15]+toCol )) = r1;
00090
00091 #else
00092 #ifdef __MMX__
00093 __m64 r1,r2,r3,r4,r5,r6,r7,r8,tmp1;
00094
00095 r1 = *((__m64*) ( src->array[fromRow]+fromCol ));
00096 r2 = *((__m64*) ( src->array[fromRow+1]+fromCol ));
00097 r3 = *((__m64*) ( src->array[fromRow+2]+fromCol ));
00098 r4 = *((__m64*) ( src->array[fromRow+3]+fromCol ));
00099 r5 = *((__m64*) ( src->array[fromRow+4]+fromCol ));
00100 r6 = *((__m64*) ( src->array[fromRow+5]+fromCol ));
00101 r7 = *((__m64*) ( src->array[fromRow+6]+fromCol ));
00102 r8 = *((__m64*) ( src->array[fromRow+7]+fromCol ));
00103
00104 FLGR_MACRO_TRANSPOSE_BLOCK_8x8_BYTE(tmp1,r1,r2,r3,r4,r5,r6,r7,r8);
00105
00106 *((__m64*) ( dest->array[toRow]+toCol )) = r8;
00107 *((__m64*) ( dest->array[toRow+1]+toCol )) = r4;
00108 *((__m64*) ( dest->array[toRow+2]+toCol )) = r6;
00109 *((__m64*) ( dest->array[toRow+3]+toCol )) = r2;
00110 *((__m64*) ( dest->array[toRow+4]+toCol )) = r7;
00111 *((__m64*) ( dest->array[toRow+5]+toCol )) = r3;
00112 *((__m64*) ( dest->array[toRow+6]+toCol )) = r5;
00113 *((__m64*) ( dest->array[toRow+7]+toCol )) = r1;
00114
00115 _mm_empty();
00116 #else
00117 POST_ERROR("MMX/SSE2 Instruction not supported!\n");
00118 #endif
00119 #endif
00120 }
00121
00122
00123 void flgr_transp_block_word(FLGR_Data2D *dest, FLGR_Data2D *src, int fromRow, int fromCol, int toRow, int toCol) {
00124 #ifdef __SSE2__
00125 __m128i r1,r2,r3,r4,r5,r6,r7,r8,tmp1;
00126 int fcol = fromCol*2;
00127 int tcol = toCol*2;
00128
00129 r1 = *((__m128i*) ( src->array[fromRow]+fcol ));
00130 r2 = *((__m128i*) ( src->array[fromRow+1]+fcol ));
00131 r3 = *((__m128i*) ( src->array[fromRow+2]+fcol ));
00132 r4 = *((__m128i*) ( src->array[fromRow+3]+fcol ));
00133 r5 = *((__m128i*) ( src->array[fromRow+4]+fcol ));
00134 r6 = *((__m128i*) ( src->array[fromRow+5]+fcol ));
00135 r7 = *((__m128i*) ( src->array[fromRow+6]+fcol ));
00136 r8 = *((__m128i*) ( src->array[fromRow+7]+fcol ));
00137
00138 FLGR_MACRO_TRANSPOSE_BLOCK_8x8_WORD(r1,r2,r3,r4,r5,r6,r7,r8,tmp1);
00139
00140 *((__m128i*) ( dest->array[toRow+0]+tcol )) = r8;
00141 *((__m128i*) ( dest->array[toRow+1]+tcol )) = r4;
00142 *((__m128i*) ( dest->array[toRow+2]+tcol )) = r6;
00143 *((__m128i*) ( dest->array[toRow+3]+tcol )) = r2;
00144 *((__m128i*) ( dest->array[toRow+4]+tcol )) = r7;
00145 *((__m128i*) ( dest->array[toRow+5]+tcol )) = r3;
00146 *((__m128i*) ( dest->array[toRow+6]+tcol )) = r5;
00147 *((__m128i*) ( dest->array[toRow+7]+tcol )) = r1;
00148
00149 #else
00150 POST_ERROR("SSE2 Instruction not supported!\n");
00151 #endif
00152 }
00153
00154
00155
00156
00157 void flgr_transp_block_dword(FLGR_Data2D *dest, FLGR_Data2D *src, int fromRow, int fromCol, int toRow, int toCol) {
00158 #ifdef __SSE2__
00159 __m128i r1,r2,r3,r4,tmp1;
00160 int fcol = fromCol*4;
00161 int tcol = toCol*4;
00162
00163 r1 = *((__m128i*) ( src->array[fromRow]+fcol ));
00164 r2 = *((__m128i*) ( src->array[fromRow+1]+fcol ));
00165 r3 = *((__m128i*) ( src->array[fromRow+2]+fcol ));
00166 r4 = *((__m128i*) ( src->array[fromRow+3]+fcol ));
00167
00168 FLGR_MACRO_TRANSPOSE_BLOCK_4x4_DWORD(r1,r2,r3,r4,tmp1);
00169
00170 *((__m128i*) ( dest->array[toRow+0]+tcol )) = r4;
00171 *((__m128i*) ( dest->array[toRow+1]+tcol )) = r2;
00172 *((__m128i*) ( dest->array[toRow+2]+tcol )) = r3;
00173 *((__m128i*) ( dest->array[toRow+3]+tcol )) = r1;
00174
00175 #else
00176 POST_ERROR("SSE2 Instruction not supported!\n");
00177 #endif
00178 }
00179
00180
00181
00182
00183
00184
00185 void flgr2d_transpose_fast_fgUINT8(FLGR_Data2D *imgdest, FLGR_Data2D *imgsrc) {
00186 int i,j;
00187
00188
00189
00190 for(i=0 ; i<imgsrc->size_y ; i+=BYTE_BLOCK_SIZE) {
00191 for(j=0 ; j<imgsrc->size_x ; j+=BYTE_BLOCK_SIZE) {
00192 flgr_transp_block_byte(imgdest,imgsrc,i,j,j,i);
00193 }
00194 }
00195
00196 }
00197
00198 void flgr2d_transpose_fast_fgUINT16(FLGR_Data2D *imgdest, FLGR_Data2D *imgsrc) {
00199 int i,j;
00200
00201
00202
00203 for(i=0 ; i<imgsrc->size_y ; i+=WORD_BLOCK_SIZE) {
00204 for(j=0 ; j<imgsrc->size_x ; j+=WORD_BLOCK_SIZE) {
00205 flgr_transp_block_word(imgdest,imgsrc,i,j,j,i);
00206 }
00207 }
00208
00209 }
00210
00211 void flgr2d_transpose_fast_fgUINT32(FLGR_Data2D *imgdest, FLGR_Data2D *imgsrc) {
00212 int i,j;
00213
00214
00215
00216 for(i=0 ; i<imgsrc->size_y ; i+=DWORD_BLOCK_SIZE) {
00217 for(j=0 ; j<imgsrc->size_x ; j+=DWORD_BLOCK_SIZE) {
00218 flgr_transp_block_dword(imgdest,imgsrc,i,j,j,i);
00219 }
00220 }
00221
00222 }