SDL_gfx
2.0.24
|
00001 /* 00002 00003 SDL_imageFilter.c: byte-image "filter" routines 00004 00005 Copyright (C) 2001-2012 Andreas Schiffler 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for any damages 00009 arising from the use of this software. 00010 00011 Permission is granted to anyone to use this software for any purpose, 00012 including commercial applications, and to alter it and redistribute it 00013 freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must not 00016 claim that you wrote the original software. If you use this software 00017 in a product, an acknowledgment in the product documentation would be 00018 appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must not be 00021 misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any source 00024 distribution. 00025 00026 Andreas Schiffler -- aschiffler at ferzkopp dot net 00027 00028 */ 00029 00030 /* 00031 00032 Note: Uses inline x86 MMX or ASM optimizations if available and enabled. 00033 00034 Note: Most of the MMX code is based on published routines 00035 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 00036 him for his work. 
00037 00038 */ 00039 00040 #include <stdio.h> 00041 #include <stdlib.h> 00042 #include <string.h> 00043 00044 #include "SDL_imageFilter.h" 00045 00049 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24)) 00050 00051 /* ------ Static variables ----- */ 00052 00056 static int SDL_imageFilterUseMMX = 1; 00057 00058 /* Detect GCC */ 00059 #if defined(__GNUC__) 00060 #define GCC__ 00061 #endif 00062 00068 unsigned int _cpuFlags() 00069 { 00070 unsigned int flags = 0; 00071 00072 #ifdef USE_MMX 00073 #if !defined(GCC__) 00074 __asm 00075 { 00076 pusha 00077 mov eax, 1 00078 cpuid /* get CPU ID flag */ 00079 mov flags,edx /* move result to mmx_bit */ 00080 popa 00081 } 00082 #else 00083 asm volatile ("pusha \n\t" "mov %1, %%eax \n\t" /* request feature flag */ 00084 "cpuid \n\t" /* get CPU ID flag */ 00085 "mov %%edx, %0 \n\t" /* move result to mmx_bit */ 00086 "popa \n\t":"=m" (flags) /* %0 */ 00087 :"i"(0x00000001) /* %1 */ 00088 ); 00089 #endif 00090 #endif 00091 00092 return (flags); 00093 } 00094 00100 int SDL_imageFilterMMXdetect(void) 00101 { 00102 unsigned int mmx_bit; 00103 00104 /* Check override flag */ 00105 if (SDL_imageFilterUseMMX == 0) { 00106 return (0); 00107 } 00108 00109 mmx_bit = _cpuFlags(); 00110 mmx_bit &= 0x00800000; 00111 mmx_bit = (mmx_bit && 0x00800000); 00112 00113 return (int)(mmx_bit); 00114 } 00115 00119 void SDL_imageFilterMMXoff() 00120 { 00121 SDL_imageFilterUseMMX = 0; 00122 } 00123 00127 void SDL_imageFilterMMXon() 00128 { 00129 SDL_imageFilterUseMMX = 1; 00130 } 00131 00132 /* ------------------------------------------------------------------------------------ */ 00133 00144 int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00145 { 00146 #ifdef USE_MMX 00147 #if !defined(GCC__) 00148 __asm 00149 { 00150 pusha 00151 mov eax, Src1 /* load Src1 address into eax */ 00152 mov ebx, Src2 /* load Src2 address into ebx */ 00153 mov 
edi, Dest /* load Dest address into edi */ 00154 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00155 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00156 align 16 /* 16 byte alignment of the loop entry */ 00157 L1010: 00158 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00159 paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */ 00160 movq [edi], mm1 /* store result in Dest */ 00161 add eax, 8 /* increase Src1, Src2 and Dest */ 00162 add ebx, 8 /* register pointers by 8 */ 00163 add edi, 8 00164 dec ecx /* decrease loop counter */ 00165 jnz L1010 /* check loop termination, proceed if required */ 00166 emms /* exit MMX state */ 00167 popa 00168 } 00169 #else 00170 asm volatile 00171 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00172 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00173 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00174 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00175 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00176 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00177 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00178 "paddusb (%%ebx), %%mm1 \n\t" /* mm1=Src1+Src2 (add 8 bytes with saturation) */ 00179 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 00180 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 00181 "add $8, %%ebx \n\t" /* register pointers by 8 */ 00182 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 00183 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00184 "emms \n\t" /* exit MMX state */ 00185 "popa \n\t":"=m" (Dest) /* %0 */ 00186 :"m"(Src2), /* %1 */ 00187 "m"(Src1), /* %2 */ 00188 "m"(SrcLength) /* %3 */ 00189 ); 00190 #endif 00191 return (0); 00192 #else 00193 return (-1); 00194 #endif 00195 } 00196

/*!
\brief Filter using Add: D = saturation255(S1 + S2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 if any pointer is NULL.
*/
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;

	/* Reject missing buffers; an empty range is trivially done */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX handles every complete group of 8 bytes ... */
		SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
		/* ... so the C loop only has to finish the remainder (if any) */
		pos = length & 0xfffffff8;
	}

	/* C routine: whole image, or the bytes left over after MMX */
	for (; pos < length; pos++) {
		int sum = (int) Src1[pos] + (int) Src2[pos];
		Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
	}

	return (0);
}

00269 int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength, 00270 unsigned char *Mask) 00271 { 00272 #ifdef USE_MMX 00273 #if !defined(GCC__) 00274 __asm 00275 { 00276 pusha 00277 mov edx, Mask /* load Mask address into edx */ 00278 movq mm0, [edx] /* load Mask into mm0 */ 00279 mov eax, Src1 /* load Src1 address into eax */ 00280 mov ebx, Src2 /* load Src2 address into ebx */ 00281 mov edi, Dest /* load Dest address into edi */ 00282 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00283 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00284 align 16 /* 16 byte alignment of the loop entry */ 00285 L21011: 00286 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00287 
movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00288 /* --- Byte shift via Word shift --- */ 00289 psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */ 00290 psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */ 00291 pand mm1, mm0 // apply Mask to 8 BYTES of mm1 */ 00292 /* byte 0x0f, 0xdb, 0xc8 */ 00293 pand mm2, mm0 // apply Mask to 8 BYTES of mm2 */ 00294 /* byte 0x0f, 0xdb, 0xd0 */ 00295 paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */ 00296 movq [edi], mm1 /* store result in Dest */ 00297 add eax, 8 /* increase Src1, Src2 and Dest */ 00298 add ebx, 8 /* register pointers by 8 */ 00299 add edi, 8 00300 dec ecx /* decrease loop counter */ 00301 jnz L21011 /* check loop termination, proceed if required */ 00302 emms /* exit MMX state */ 00303 popa 00304 } 00305 #else 00306 asm volatile 00307 ("pusha \n\t" "movl %4, %%edx \n\t" /* load Mask address into edx */ 00308 "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ 00309 "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00310 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00311 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00312 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00313 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00314 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00315 "1: \n\t" 00316 "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00317 "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ 00318 /* --- Byte shift via Word shift --- */ 00319 "psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of mm1 1 bit to the right */ 00320 "psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of mm2 1 bit to the right */ 00321 /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of mm1 */ 00322 ".byte 0x0f, 0xdb, 0xc8 \n\t" 00323 /* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of mm2 */ 00324 ".byte 0x0f, 0xdb, 0xd0 \n\t" 00325 "paddusb %%mm2, %%mm1 \n\t" /* mm1=mm1+mm2 (add 8 bytes with saturation) */ 
00326 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 00327 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 00328 "add $8, %%ebx \n\t" /* register pointers by 8 */ 00329 "add $8, %%edi \n\t" 00330 "dec %%ecx \n\t" /* decrease loop counter */ 00331 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00332 "emms \n\t" /* exit MMX state */ 00333 "popa \n\t":"=m" (Dest) /* %0 */ 00334 :"m"(Src2), /* %1 */ 00335 "m"(Src1), /* %2 */ 00336 "m"(SrcLength), /* %3 */ 00337 "m"(Mask) /* %4 */ 00338 ); 00339 #endif 00340 return (0); 00341 #else 00342 return (-1); 00343 #endif 00344 } 00345 00356 int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00357 { 00358 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 00359 unsigned int i, istart; 00360 unsigned char *cursrc1, *cursrc2, *curdst; 00361 int result; 00362 00363 /* Validate input parameters */ 00364 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00365 return(-1); 00366 if (length == 0) 00367 return(0); 00368 00369 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00370 /* MMX routine */ 00371 SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask); 00372 00373 /* Check for unaligned bytes */ 00374 if ((length & 7) > 0) { 00375 /* Setup to process unaligned bytes */ 00376 istart = length & 0xfffffff8; 00377 cursrc1 = &Src1[istart]; 00378 cursrc2 = &Src2[istart]; 00379 curdst = &Dest[istart]; 00380 } else { 00381 /* No unaligned bytes - we are done */ 00382 return (0); 00383 } 00384 } else { 00385 /* Setup to process whole image */ 00386 istart = 0; 00387 cursrc1 = Src1; 00388 cursrc2 = Src2; 00389 curdst = Dest; 00390 } 00391 00392 /* C routine to process image */ 00393 for (i = istart; i < length; i++) { 00394 result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2; 00395 *curdst = (unsigned char) result; 00396 /* Advance pointers */ 00397 cursrc1++; 00398 cursrc2++; 00399 curdst++; 00400 } 00401 00402 
return (0); 00403 } 00404 00415 int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00416 { 00417 #ifdef USE_MMX 00418 #if !defined(GCC__) 00419 __asm 00420 { 00421 pusha 00422 mov eax, Src1 /* load Src1 address into eax */ 00423 mov ebx, Src2 /* load Src2 address into ebx */ 00424 mov edi, Dest /* load Dest address into edi */ 00425 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00426 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00427 align 16 /* 16 byte alignment of the loop entry */ 00428 L1012: 00429 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00430 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00431 movq [edi], mm1 /* store result in Dest */ 00432 add eax, 8 /* increase Src1, Src2 and Dest */ 00433 add ebx, 8 /* register pointers by 8 */ 00434 add edi, 8 00435 dec ecx /* decrease loop counter */ 00436 jnz L1012 /* check loop termination, proceed if required */ 00437 emms /* exit MMX state */ 00438 popa 00439 } 00440 #else 00441 asm volatile 00442 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00443 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00444 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00445 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00446 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00447 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00448 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00449 "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00450 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 00451 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 00452 "add $8, %%ebx \n\t" /* register pointers by 8 */ 00453 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 00454 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00455 "emms \n\t" /* exit MMX state */ 
00456 "popa \n\t":"=m" (Dest) /* %0 */ 00457 :"m"(Src2), /* %1 */ 00458 "m"(Src1), /* %2 */ 00459 "m"(SrcLength) /* %3 */ 00460 ); 00461 #endif 00462 return (0); 00463 #else 00464 return (-1); 00465 #endif 00466 } 00467 00478 int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00479 { 00480 unsigned int i, istart; 00481 unsigned char *cursrc1, *cursrc2, *curdst; 00482 int result; 00483 00484 /* Validate input parameters */ 00485 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00486 return(-1); 00487 if (length == 0) 00488 return(0); 00489 00490 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00491 /* MMX routine */ 00492 SDL_imageFilterSubMMX(Src1, Src2, Dest, length); 00493 00494 /* Check for unaligned bytes */ 00495 if ((length & 7) > 0) { 00496 /* Setup to process unaligned bytes */ 00497 istart = length & 0xfffffff8; 00498 cursrc1 = &Src1[istart]; 00499 cursrc2 = &Src2[istart]; 00500 curdst = &Dest[istart]; 00501 } else { 00502 /* No unaligned bytes - we are done */ 00503 return (0); 00504 } 00505 } else { 00506 /* Setup to process whole image */ 00507 istart = 0; 00508 cursrc1 = Src1; 00509 cursrc2 = Src2; 00510 curdst = Dest; 00511 } 00512 00513 /* C routine to process image */ 00514 for (i = istart; i < length; i++) { 00515 result = (int) *cursrc1 - (int) *cursrc2; 00516 if (result < 0) 00517 result = 0; 00518 *curdst = (unsigned char) result; 00519 /* Advance pointers */ 00520 cursrc1++; 00521 cursrc2++; 00522 curdst++; 00523 } 00524 00525 return (0); 00526 } 00527 00538 int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00539 { 00540 #ifdef USE_MMX 00541 #if !defined(GCC__) 00542 __asm 00543 { 00544 pusha 00545 mov eax, Src1 /* load Src1 address into eax */ 00546 mov ebx, Src2 /* load Src2 address into ebx */ 00547 mov edi, Dest /* load Dest address into edi */ 00548 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 
00549 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00550 align 16 /* 16 byte alignment of the loop entry */ 00551 L1013: 00552 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00553 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00554 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00555 psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ 00556 por mm1, mm2 /* combine both mm2 and mm1 results */ 00557 movq [edi], mm1 /* store result in Dest */ 00558 add eax, 8 /* increase Src1, Src2 and Dest */ 00559 add ebx, 8 /* register pointers by 8 */ 00560 add edi, 8 00561 dec ecx /* decrease loop counter */ 00562 jnz L1013 /* check loop termination, proceed if required */ 00563 emms /* exit MMX state */ 00564 popa 00565 } 00566 #else 00567 asm volatile 00568 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00569 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00570 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00571 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00572 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00573 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00574 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00575 "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ 00576 "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00577 "psubusb (%%eax), %%mm2 \n\t" /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ 00578 "por %%mm2, %%mm1 \n\t" /* combine both mm2 and mm1 results */ 00579 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 00580 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 00581 "add $8, %%ebx \n\t" /* register pointers by 8 */ 00582 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 00583 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00584 "emms \n\t" /* exit MMX state */ 00585 "popa \n\t":"=m" (Dest) /* %0 */ 
00586 :"m"(Src2), /* %1 */ 00587 "m"(Src1), /* %2 */ 00588 "m"(SrcLength) /* %3 */ 00589 ); 00590 #endif 00591 return (0); 00592 #else 00593 return (-1); 00594 #endif 00595 } 00596 00607 int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00608 { 00609 unsigned int i, istart; 00610 unsigned char *cursrc1, *cursrc2, *curdst; 00611 int result; 00612 00613 /* Validate input parameters */ 00614 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00615 return(-1); 00616 if (length == 0) 00617 return(0); 00618 00619 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00620 /* MMX routine */ 00621 SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length); 00622 00623 /* Check for unaligned bytes */ 00624 if ((length & 7) > 0) { 00625 /* Setup to process unaligned bytes */ 00626 istart = length & 0xfffffff8; 00627 cursrc1 = &Src1[istart]; 00628 cursrc2 = &Src2[istart]; 00629 curdst = &Dest[istart]; 00630 } else { 00631 /* No unaligned bytes - we are done */ 00632 return (0); 00633 } 00634 } else { 00635 /* Setup to process whole image */ 00636 istart = 0; 00637 cursrc1 = Src1; 00638 cursrc2 = Src2; 00639 curdst = Dest; 00640 } 00641 00642 /* C routine to process image */ 00643 for (i = istart; i < length; i++) { 00644 result = abs((int) *cursrc1 - (int) *cursrc2); 00645 *curdst = (unsigned char) result; 00646 /* Advance pointers */ 00647 cursrc1++; 00648 cursrc2++; 00649 curdst++; 00650 } 00651 00652 return (0); 00653 } 00654 00665 int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00666 { 00667 #ifdef USE_MMX 00668 #if !defined(GCC__) 00669 __asm 00670 { 00671 pusha 00672 mov eax, Src1 /* load Src1 address into eax */ 00673 mov ebx, Src2 /* load Src2 address into ebx */ 00674 mov edi, Dest /* load Dest address into edi */ 00675 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00676 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00677 
pxor mm0, mm0 /* zero mm0 register */ 00678 align 16 /* 16 byte alignment of the loop entry */ 00679 L1014: 00680 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00681 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00682 movq mm2, mm1 /* copy mm1 into mm2 */ 00683 movq mm4, mm3 /* copy mm3 into mm4 */ 00684 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00685 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00686 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00687 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00688 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00689 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 00690 /* Take abs value of the results (signed words) */ 00691 movq mm5, mm1 /* copy mm1 into mm5 */ 00692 movq mm6, mm2 /* copy mm2 into mm6 */ 00693 psraw mm5, 15 /* fill mm5 words with word sign bit */ 00694 psraw mm6, 15 /* fill mm6 words with word sign bit */ 00695 pxor mm1, mm5 /* take 1's compliment of only neg. words */ 00696 pxor mm2, mm6 /* take 1's compliment of only neg. words */ 00697 psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */ 00698 psubsw mm2, mm6 /* add 1 to only neg. 
words, W-(-1) or W-0 */ 00699 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00700 movq [edi], mm1 /* store result in Dest */ 00701 add eax, 8 /* increase Src1, Src2 and Dest */ 00702 add ebx, 8 /* register pointers by 8 */ 00703 add edi, 8 00704 dec ecx /* decrease loop counter */ 00705 jnz L1014 /* check loop termination, proceed if required */ 00706 emms /* exit MMX state */ 00707 popa 00708 } 00709 #else 00710 asm volatile 00711 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00712 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00713 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00714 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00715 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00716 "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ 00717 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00718 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00719 "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ 00720 "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ 00721 "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ 00722 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ 00723 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ 00724 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */ 00725 "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */ 00726 "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */ 00727 "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */ 00728 /* Take abs value of the results (signed words) */ 00729 "movq %%mm1, %%mm5 \n\t" /* copy mm1 into mm5 */ 00730 "movq %%mm2, %%mm6 \n\t" /* copy mm2 into mm6 */ 00731 "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */ 00732 "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */ 00733 "pxor %%mm5, %%mm1 \n\t" /* take 1's compliment of only neg. 
words */ 00734 "pxor %%mm6, %%mm2 \n\t" /* take 1's compliment of only neg. words */ 00735 "psubsw %%mm5, %%mm1 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */ 00736 "psubsw %%mm6, %%mm2 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */ 00737 "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ 00738 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 00739 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 00740 "add $8, %%ebx \n\t" /* register pointers by 8 */ 00741 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 00742 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00743 "emms \n\t" /* exit MMX state */ 00744 "popa \n\t":"=m" (Dest) /* %0 */ 00745 :"m"(Src2), /* %1 */ 00746 "m"(Src1), /* %2 */ 00747 "m"(SrcLength) /* %3 */ 00748 ); 00749 #endif 00750 return (0); 00751 #else 00752 return (-1); 00753 #endif 00754 } 00755

/*!
\brief Filter using Mult: D = saturation255(S1 * S2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 if any pointer is NULL.
*/
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;

	/* Reject missing buffers; an empty range is trivially done */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX handles every complete group of 8 bytes */
		SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
		/* Continue in C behind the last complete group */
		pos = length & 0xfffffff8;
	}

	/*
	 * C routine: whole image, or the bytes left over after MMX.
	 * NOTE(review): the MMX routine multiplies 16-bit words, takes their
	 * absolute value and packs with unsigned saturation; for byte inputs that
	 * appears to yield the same clamp-to-255 product computed here - confirm
	 * on hardware before relying on bit-exact agreement.
	 */
	for (; pos < length; pos++) {
		int product = (int) Src1[pos] * (int) Src2[pos];
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
	}

	return (0);
}

00829 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00830 { 00831 #ifdef USE_MMX 00832 #if !defined(GCC__) 00833 __asm 00834 { 00835 pusha 00836 mov edx, Src1 /* load Src1 address into edx */ 00837 mov esi, Src2 /* load Src2 address into esi */ 00838 mov edi, Dest /* load Dest address into edi */ 00839 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00840 align 16 /* 16 byte alignment of the loop entry */ 00841 L10141: 00842 mov al, [edx] /* load a byte from Src1 */ 00843 mul [esi] /* mul with a byte from Src2 */ 00844 mov [edi], al /* move a byte result to Dest */ 00845 inc edx /* increment Src1, Src2, Dest */ 00846 inc esi /* pointer registers by one */ 00847 inc edi 00848 dec ecx /* decrease loop counter */ 00849 jnz L10141 /* check loop termination, proceed if required */ 00850 popa 00851 } 00852 #else 00853 asm volatile 00854 ("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ 00855 "mov %1, %%esi \n\t" /* load Src2 address into esi */ 00856 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00857 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00858 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00859 "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */ 00860 "mulb (%%esi) \n\t" /* mul with a byte from Src2 */ 00861 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 00862 "inc %%edx \n\t" /* increment Src1, Src2, Dest */ 00863 "inc %%esi \n\t" /* pointer registers by one */ 00864 "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 00865 "jnz 1b \n\t" /* check loop termination, 
proceed if required */ 00866 "popa \n\t":"=m" (Dest) /* %0 */ 00867 :"m"(Src2), /* %1 */ 00868 "m"(Src1), /* %2 */ 00869 "m"(SrcLength) /* %3 */ 00870 ); 00871 #endif 00872 return (0); 00873 #else 00874 return (-1); 00875 #endif 00876 } 00877 00888 int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00889 { 00890 unsigned int i, istart; 00891 unsigned char *cursrc1, *cursrc2, *curdst; 00892 int result; 00893 00894 /* Validate input parameters */ 00895 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00896 return(-1); 00897 if (length == 0) 00898 return(0); 00899 00900 if (SDL_imageFilterMMXdetect()) { 00901 if (length > 0) { 00902 /* ASM routine */ 00903 SDL_imageFilterMultNorASM(Src1, Src2, Dest, length); 00904 00905 /* Check for unaligned bytes */ 00906 if ((length & 7) > 0) { 00907 /* Setup to process unaligned bytes */ 00908 istart = length & 0xfffffff8; 00909 cursrc1 = &Src1[istart]; 00910 cursrc2 = &Src2[istart]; 00911 curdst = &Dest[istart]; 00912 } else { 00913 /* No unaligned bytes - we are done */ 00914 return (0); 00915 } 00916 } else { 00917 /* No bytes - we are done */ 00918 return (0); 00919 } 00920 } else { 00921 /* Setup to process whole image */ 00922 istart = 0; 00923 cursrc1 = Src1; 00924 cursrc2 = Src2; 00925 curdst = Dest; 00926 } 00927 00928 /* C routine to process image */ 00929 for (i = istart; i < length; i++) { 00930 result = (int) *cursrc1 * (int) *cursrc2; 00931 *curdst = (unsigned char) result; 00932 /* Advance pointers */ 00933 cursrc1++; 00934 cursrc2++; 00935 curdst++; 00936 } 00937 00938 return (0); 00939 } 00940 00951 int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00952 { 00953 #ifdef USE_MMX 00954 #if !defined(GCC__) 00955 __asm 00956 { 00957 pusha 00958 mov eax, Src1 /* load Src1 address into eax */ 00959 mov ebx, Src2 /* load Src2 address into ebx */ 00960 mov edi, Dest /* load Dest address 
into edi */ 00961 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00962 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00963 pxor mm0, mm0 /* zero mm0 register */ 00964 align 16 /* 16 byte alignment of the loop entry */ 00965 L1015: 00966 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00967 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00968 movq mm2, mm1 /* copy mm1 into mm2 */ 00969 movq mm4, mm3 /* copy mm3 into mm4 */ 00970 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00971 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00972 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00973 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00974 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 00975 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 00976 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00977 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 00978 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00979 movq [edi], mm1 /* store result in Dest */ 00980 add eax, 8 /* increase Src1, Src2 and Dest */ 00981 add ebx, 8 /* register pointers by 8 */ 00982 add edi, 8 00983 dec ecx /* decrease loop counter */ 00984 jnz L1015 /* check loop termination, proceed if required */ 00985 emms /* exit MMX state */ 00986 popa 00987 } 00988 #else 00989 asm volatile 00990 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 00991 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 00992 "mov %0, %%edi \n\t" /* load Dest address into edi */ 00993 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 00994 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 00995 "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ 00996 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00997 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 00998 "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into 
mm3 */ 00999 "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ 01000 "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ 01001 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ 01002 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ 01003 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */ 01004 "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */ 01005 "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */ 01006 "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */ 01007 "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */ 01008 "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */ 01009 "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ 01010 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 01011 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 01012 "add $8, %%ebx \n\t" /* register pointers by 8 */ 01013 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01014 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01015 "emms \n\t" /* exit MMX state */ 01016 "popa \n\t":"=m" (Dest) /* %0 */ 01017 :"m"(Src2), /* %1 */ 01018 "m"(Src1), /* %2 */ 01019 "m"(SrcLength) /* %3 */ 01020 ); 01021 #endif 01022 return (0); 01023 #else 01024 return (-1); 01025 #endif 01026 } 01027

/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 if any pointer is NULL.
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;

	/* Reject missing buffers; an empty range is trivially done */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX handles every complete group of 8 bytes */
		SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
		/* Continue in C behind the last complete group */
		pos = length & 0xfffffff8;
	}

	/* C routine: whole image, or the bytes left over after MMX */
	for (; pos < length; pos++) {
		int product = ((int) Src1[pos] / 2) * (int) Src2[pos];
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
	}

	return (0);
}

01098 int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01099 { 01100 #ifdef USE_MMX 01101 #if !defined(GCC__) 01102 __asm 01103 { 01104 pusha 01105 mov eax, Src1 /* load Src1 address into eax */ 01106 mov ebx, Src2 /* load Src2 address into ebx */ 01107 mov edi, Dest /* load Dest address into edi */ 01108 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01109 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01110 pxor mm0, mm0 /* zero mm0 register */ 01111 align 16 /* 16 byte alignment of the loop entry */ 01112 L1016: 01113 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01114 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 01115 movq mm2, mm1 /* copy mm1 into mm2 */ 01116 movq mm4, mm3 /* copy mm3 into mm4 */ 01117 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 01118 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 01119 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 01120 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 01121 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 01122 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 01123 psrlw mm3, 1 /* 
divide mm3 words by 2, Src2 low bytes */ 01124 psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */ 01125 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 01126 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 01127 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 01128 movq [edi], mm1 /* store result in Dest */ 01129 add eax, 8 /* increase Src1, Src2 and Dest */ 01130 add ebx, 8 /* register pointers by 8 */ 01131 add edi, 8 01132 dec ecx /* decrease loop counter */ 01133 jnz L1016 /* check loop termination, proceed if required */ 01134 emms /* exit MMX state */ 01135 popa 01136 } 01137 #else 01138 asm volatile 01139 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 01140 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 01141 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01142 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01143 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01144 "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ 01145 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01146 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 01147 "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ 01148 "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ 01149 "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ 01150 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ 01151 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ 01152 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */ 01153 "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */ 01154 "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */ 01155 "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */ 01156 "psrlw $1, %%mm3 \n\t" /* divide mm3 words by 2, Src2 low bytes */ 01157 "psrlw $1, %%mm4 \n\t" /* divide mm4 words by 2, Src2 high bytes */ 01158 "pmullw %%mm3, %%mm1 
\n\t" /* mul low bytes of Src1 and Src2 */ 01159 "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */ 01160 "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ 01161 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 01162 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 01163 "add $8, %%ebx \n\t" /* register pointers by 8 */ 01164 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01165 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01166 "emms \n\t" /* exit MMX state */ 01167 "popa \n\t":"=m" (Dest) /* %0 */ 01168 :"m"(Src2), /* %1 */ 01169 "m"(Src1), /* %2 */ 01170 "m"(SrcLength) /* %3 */ 01171 ); 01172 #endif 01173 return (0); 01174 #else 01175 return (-1); 01176 #endif 01177 }

/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

When MMX is detected and at least 8 bytes are supplied, the MMX routine
handles the leading multiple-of-8 bytes and the C loop below finishes the
remaining (length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the first source byte array (S1).
\param Src2 Pointer to the second source byte array (S2).
\param Dest Pointer to the destination byte array (D).
\param length Number of bytes to process.

\return 0 on success, -1 if any of the pointers is NULL.
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;
	int product;

	/* Reject missing buffers; an empty request is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Bulk of the work: whole 8-byte groups via MMX */
		SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);

		/* Nothing left over when the length is a multiple of 8 */
		if ((length & 7) == 0) {
			return (0);
		}

		/* Resume right after the last full 8-byte group */
		pos = length & 0xfffffff8;
	}

	/* Plain C path: halve both operands, multiply, clamp to 255 */
	for (; pos < length; pos++) {
		product = ((int) Src1[pos] / 2) * ((int) Src2[pos] / 2);
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
	}

	return (0);
}

01249 int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01250 { 01251 #ifdef USE_MMX 01252 #if !defined(GCC__) 01253 __asm 01254 { 01255 pusha 01256 mov eax, Src1 /* load Src1 address into eax */ 01257 mov ebx, Src2 /* load Src2 address into ebx */ 01258 mov edi, Dest /* load Dest address into edi */ 01259 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01260 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01261 align 16 /* 16 byte alignment of the loop entry */ 01262 L1017: 01263 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01264 pand mm1, [ebx] /* mm1=Src1&Src2 */ 01265 movq [edi], mm1 /* store result in Dest */ 01266 add eax, 8 /* increase Src1, Src2 and Dest */ 01267 add ebx, 8 /* register pointers by 8 */ 01268 add edi, 8 01269 dec ecx /* decrease loop counter */ 01270 jnz L1017 /* check loop termination, proceed if required */ 01271 emms /* exit MMX state */ 01272 popa 01273 } 01274 #else 01275 asm volatile 01276 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 01277 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 01278 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01279 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01280 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01281 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01282 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 01283 "pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */ 01284 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 01285 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 01286 "add $8, %%ebx \n\t" /* register pointers by 8 */ 01287 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01288 "jnz 1b \n\t" /* check loop
termination, proceed if required */ 01289 "emms \n\t" /* exit MMX state */ 01290 "popa \n\t":"=m" (Dest) /* %0 */ 01291 :"m"(Src2), /* %1 */ 01292 "m"(Src1), /* %2 */ 01293 "m"(SrcLength) /* %3 */ 01294 ); 01295 #endif 01296 return (0); 01297 #else 01298 return (-1); 01299 #endif 01300 } 01301 01312 int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01313 { 01314 unsigned int i, istart; 01315 unsigned char *cursrc1, *cursrc2, *curdst; 01316 01317 /* Validate input parameters */ 01318 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01319 return(-1); 01320 if (length == 0) 01321 return(0); 01322 01323 if ((SDL_imageFilterMMXdetect()>0) && (length>7)) { 01324 /* if (length > 7) { */ 01325 /* Call MMX routine */ 01326 01327 SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length); 01328 01329 /* Check for unaligned bytes */ 01330 if ((length & 7) > 0) { 01331 01332 /* Setup to process unaligned bytes */ 01333 istart = length & 0xfffffff8; 01334 cursrc1 = &Src1[istart]; 01335 cursrc2 = &Src2[istart]; 01336 curdst = &Dest[istart]; 01337 } else { 01338 /* No unaligned bytes - we are done */ 01339 return (0); 01340 } 01341 } else { 01342 /* Setup to process whole image */ 01343 istart = 0; 01344 cursrc1 = Src1; 01345 cursrc2 = Src2; 01346 curdst = Dest; 01347 } 01348 01349 /* C routine to process image */ 01350 for (i = istart; i < length; i++) { 01351 *curdst = (*cursrc1) & (*cursrc2); 01352 /* Advance pointers */ 01353 cursrc1++; 01354 cursrc2++; 01355 curdst++; 01356 } 01357 01358 return (0); 01359 } 01360 01371 int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01372 { 01373 #ifdef USE_MMX 01374 #if !defined(GCC__) 01375 __asm 01376 { 01377 pusha 01378 mov eax, Src1 /* load Src1 address into eax */ 01379 mov ebx, Src2 /* load Src2 address into ebx */ 01380 mov edi, Dest /* load Dest address into edi */ 01381 mov ecx, SrcLength /* load loop 
counter (SIZE) into ecx */ 01382 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01383 align 16 /* 16 byte alignment of the loop entry */ 01384 L91017: 01385 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01386 por mm1, [ebx] /* mm1=Src1|Src2 */ 01387 movq [edi], mm1 /* store result in Dest */ 01388 add eax, 8 /* increase Src1, Src2 and Dest */ 01389 add ebx, 8 /* register pointers by 8 */ 01390 add edi, 8 01391 dec ecx /* decrease loop counter */ 01392 jnz L91017 /* check loop termination, proceed if required */ 01393 emms /* exit MMX state */ 01394 popa 01395 } 01396 #else 01397 asm volatile 01398 ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */ 01399 "mov %1, %%ebx \n\t" /* load Src2 address into ebx */ 01400 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01401 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01402 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01403 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01404 "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ 01405 "por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */ 01406 "movq %%mm1, (%%edi) \n\t" /* store result in Dest */ 01407 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 01408 "add $8, %%ebx \n\t" /* register pointers by 8 */ 01409 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01410 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01411 "emms \n\t" /* exit MMX state */ 01412 "popa \n\t":"=m" (Dest) /* %0 */ 01413 :"m"(Src2), /* %1 */ 01414 "m"(Src1), /* %2 */ 01415 "m"(SrcLength) /* %3 */ 01416 ); 01417 #endif 01418 return (0); 01419 #else 01420 return (-1); 01421 #endif 01422 }

/*!
\brief Filter using BitOr: D = S1 | S2

When MMX is detected and at least 8 bytes are supplied, the MMX routine
handles the leading multiple-of-8 bytes and the C loop below finishes the
remaining (length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the first source byte array (S1).
\param Src2 Pointer to the second source byte array (S2).
\param Dest Pointer to the destination byte array (D).
\param length Number of bytes to process.

\return 0 on success, -1 if any of the pointers is NULL.
*/
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;

	/* Reject missing buffers; an empty request is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Bulk of the work: whole 8-byte groups via MMX */
		SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);

		/* Nothing left over when the length is a multiple of 8 */
		if ((length & 7) == 0) {
			return (0);
		}

		/* Resume right after the last full 8-byte group */
		pos = length & 0xfffffff8;
	}

	/* Plain C path: byte-wise OR */
	for (; pos < length; pos++) {
		Dest[pos] = Src1[pos] | Src2[pos];
	}

	return (0);
}

01490 int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01491 { 01492 #ifdef USE_MMX 01493 #if !defined(GCC__) 01494 __asm 01495 { 01496 pusha 01497 mov edx, Src1 /* load Src1 address into edx */ 01498 mov esi, Src2 /* load Src2 address into esi */ 01499 mov edi, Dest /* load Dest address into edi */ 01500 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01501 align 16 /* 16 byte alignment of the loop entry */ 01502 L10191: 01503 mov bl, [esi] /* load a byte from Src2 */ 01504 cmp bl, 0 /* check if it zero */ 01505 jnz L10192 01506 mov [edi], 255 /* division by zero = 255 !!!
*/ 01507 jmp L10193 01508 L10192: 01509 xor ah, ah /* prepare AX, zero AH register */ 01510 mov al, [edx] /* load a byte from Src1 into AL */ 01511 div bl /* divide AL by BL */ 01512 mov [edi], al /* move a byte result to Dest */ 01513 L10193: 01514 inc edx /* increment Src1, Src2, Dest */ 01515 inc esi /* pointer registers by one */ 01516 inc edi 01517 dec ecx /* decrease loop counter */ 01518 jnz L10191 /* check loop termination, proceed if required */ 01519 popa 01520 } 01521 #else 01522 asm volatile 01523 ("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ 01524 "mov %1, %%esi \n\t" /* load Src2 address into esi */ 01525 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01526 "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01527 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01528 "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */ 01529 "cmp $0, %%bl \n\t" /* check if it zero */ 01530 "jnz 2f \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! 
*/ 01531 "jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ 01532 "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */ 01533 "div %%bl \n\t" /* divide AL by BL */ 01534 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 01535 "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */ 01536 "inc %%esi \n\t" /* pointer registers by one */ 01537 "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01538 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01539 "popa \n\t":"=m" (Dest) /* %0 */ 01540 :"m"(Src2), /* %1 */ 01541 "m"(Src1), /* %2 */ 01542 "m"(SrcLength) /* %3 */ 01543 ); 01544 #endif 01545 return (0); 01546 #else 01547 return (-1); 01548 #endif 01549 } 01550 01561 int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01562 { 01563 unsigned int i, istart; 01564 unsigned char *cursrc1, *cursrc2, *curdst; 01565 int result; 01566 01567 /* Validate input parameters */ 01568 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01569 return(-1); 01570 if (length == 0) 01571 return(0); 01572 01573 if (SDL_imageFilterMMXdetect()) { 01574 if (length > 0) { 01575 /* Call ASM routine */ 01576 SDL_imageFilterDivASM(Src1, Src2, Dest, length); 01577 01578 /* Never unaligned bytes - we are done */ 01579 return (0); 01580 } else { 01581 return (-1); 01582 } 01583 } 01584 01585 /* Setup to process whole image */ 01586 istart = 0; 01587 cursrc1 = Src1; 01588 cursrc2 = Src2; 01589 curdst = Dest; 01590 01591 /* C routine to process image */ 01592 for (i = istart; i < length; i++) { 01593 result = (int) *cursrc1 / (int) *cursrc2; 01594 *curdst = (unsigned char) result; 01595 /* Advance pointers */ 01596 cursrc1++; 01597 cursrc2++; 01598 curdst++; 01599 } 01600 01601 return (0); 01602 } 01603 01604 /* ------------------------------------------------------------------------------------ */ 01605 01615 int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned 
char *Dest, unsigned int SrcLength) 01616 { 01617 #ifdef USE_MMX 01618 #if !defined(GCC__) 01619 __asm 01620 { 01621 pusha 01622 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 01623 mov eax, Src1 /* load Src1 address into eax */ 01624 mov edi, Dest /* load Dest address into edi */ 01625 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01626 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01627 align 16 /* 16 byte alignment of the loop entry */ 01628 L91117: 01629 movq mm0, [eax] /* load 8 bytes from Src1 into mm1 */ 01630 pxor mm0, mm1 /* negate mm0 by xoring with mm1 */ 01631 movq [edi], mm0 /* store result in Dest */ 01632 add eax, 8 /* increase Src1, Src2 and Dest */ 01633 add edi, 8 01634 dec ecx /* decrease loop counter */ 01635 jnz L91117 /* check loop termination, proceed if required */ 01636 emms /* exit MMX state */ 01637 popa 01638 } 01639 #else 01640 asm volatile 01641 ("pusha \n\t" "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ 01642 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 01643 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01644 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01645 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01646 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01647 "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm1 */ 01648 "pxor %%mm1, %%mm0 \n\t" /* negate mm0 by xoring with mm1 */ 01649 "movq %%mm0, (%%edi) \n\t" /* store result in Dest */ 01650 "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ 01651 "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ 01652 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01653 "emms \n\t" /* exit MMX state */ 01654 "popa \n\t":"=m" (Dest) /* %0 */ 01655 :"m"(Src1), /* %1 */ 01656 "m"(SrcLength) /* %2 */ 01657 ); 01658 #endif 01659 return (0); 01660 #else 01661 return (-1); 01662 #endif 01663 }

/*!
\brief Filter using BitNegation: D = ~S

When MMX is detected and at least 8 bytes are supplied, the MMX routine
handles the leading multiple-of-8 bytes and the C loop below finishes the
remaining (length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the source byte array (S).
\param Dest Pointer to the destination byte array (D).
\param length Number of bytes to process.

\return 0 on success, -1 if either pointer is NULL.
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;

	/* Reject missing buffers; an empty request is trivially done */
	if (Src1 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Bulk of the work: whole 8-byte groups via MMX */
		SDL_imageFilterBitNegationMMX(Src1, Dest, length);

		/* Nothing left over when the length is a multiple of 8 */
		if ((length & 7) == 0) {
			return (0);
		}

		/* Resume right after the last full 8-byte group */
		pos = length & 0xfffffff8;
	}

	/* Plain C path: flip every bit of each byte */
	for (; pos < length; pos++) {
		Dest[pos] = (unsigned char) ~Src1[pos];
	}

	return (0);
}

01727 int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 01728 { 01729 #ifdef USE_MMX 01730 #if !defined(GCC__) 01731 __asm 01732 { 01733 pusha 01734 /* ** Duplicate C in 8 bytes of MM1 ** */ 01735 mov al, C /* load C into AL */ 01736 mov ah, al /* copy AL into AH */ 01737 mov bx, ax /* copy AX into BX */ 01738 shl eax, 16 /* shift 2 bytes of EAX left */ 01739 mov ax, bx /* copy BX into AX */ 01740 movd mm1, eax /* copy EAX into MM1 */ 01741 movd mm2, eax /* copy EAX into MM2 */ 01742 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01743 mov eax, Src1 /* load Src1 address into eax */ 01744 mov edi, Dest /* load Dest address into edi */ 01745 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01746 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01747 align 16 /* 16 byte alignment of the loop entry */ 01748 L1021: 01749 movq mm0,
[eax] /* load 8 bytes from Src1 into MM0 */ 01750 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01751 movq [edi], mm0 /* store result in Dest */ 01752 add eax, 8 /* increase Dest register pointer by 8 */ 01753 add edi, 8 /* increase Dest register pointer by 8 */ 01754 dec ecx /* decrease loop counter */ 01755 jnz L1021 /* check loop termination, proceed if required */ 01756 emms /* exit MMX state */ 01757 popa 01758 } 01759 #else 01760 asm volatile 01761 ("pusha \n\t" 01762 /* ** Duplicate C in 8 bytes of MM1 ** */ 01763 "mov %3, %%al \n\t" /* load C into AL */ 01764 "mov %%al, %%ah \n\t" /* copy AL into AH */ 01765 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 01766 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 01767 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 01768 "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ 01769 "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */ 01770 "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */ 01771 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 01772 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01773 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01774 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01775 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01776 "1: \n\t" 01777 "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ 01778 "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01779 "movq %%mm0, (%%edi) \n\t" /* store result in Dest */ 01780 "add $8, %%eax \n\t" /* increase Dest register pointer by 8 */ 01781 "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ 01782 "dec %%ecx \n\t" /* decrease loop counter */ 01783 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01784 "emms \n\t" /* exit MMX state */ 01785 "popa \n\t":"=m" (Dest) /* %0 */ 01786 :"m"(Src1), /* %1 */ 01787 "m"(SrcLength), /* %2 */ 01788 "m"(C) /* %3 */ 01789 ); 01790 #endif 01791 return (0); 
01792 #else 01793 return (-1); 01794 #endif 01795 } 01796 01808 int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 01809 { 01810 unsigned int i, istart; 01811 int iC; 01812 unsigned char *cursrc1, *curdest; 01813 int result; 01814 01815 /* Validate input parameters */ 01816 if ((Src1 == NULL) || (Dest == NULL)) 01817 return(-1); 01818 if (length == 0) 01819 return(0); 01820 01821 /* Special case: C==0 */ 01822 if (C == 0) { 01823 memcpy(Src1, Dest, length); 01824 return (0); 01825 } 01826 01827 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01828 01829 /* MMX routine */ 01830 SDL_imageFilterAddByteMMX(Src1, Dest, length, C); 01831 01832 /* Check for unaligned bytes */ 01833 if ((length & 7) > 0) { 01834 /* Setup to process unaligned bytes */ 01835 istart = length & 0xfffffff8; 01836 cursrc1 = &Src1[istart]; 01837 curdest = &Dest[istart]; 01838 } else { 01839 /* No unaligned bytes - we are done */ 01840 return (0); 01841 } 01842 } else { 01843 /* Setup to process whole image */ 01844 istart = 0; 01845 cursrc1 = Src1; 01846 curdest = Dest; 01847 } 01848 01849 /* C routine to process image */ 01850 iC = (int) C; 01851 for (i = istart; i < length; i++) { 01852 result = (int) *cursrc1 + iC; 01853 if (result > 255) 01854 result = 255; 01855 *curdest = (unsigned char) result; 01856 /* Advance pointers */ 01857 cursrc1++; 01858 curdest++; 01859 } 01860 return (0); 01861 } 01862 01874 int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D) 01875 { 01876 #ifdef USE_MMX 01877 #if !defined(GCC__) 01878 __asm 01879 { 01880 pusha 01881 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 01882 mov eax, C /* load C into EAX */ 01883 movd mm1, eax /* copy EAX into MM1 */ 01884 mov eax, D /* load D into EAX */ 01885 movd mm2, eax /* copy EAX into MM2 */ 01886 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01887 mov eax, Src1 /* load Src1 address 
into eax */ 01888 mov edi, Dest /* load Dest address into edi */ 01889 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01890 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01891 align 16 /* 16 byte alignment of the loop entry */ 01892 L11023: 01893 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 01894 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01895 movq [edi], mm0 /* store result in SrcDest */ 01896 add eax, 8 /* increase Src1 register pointer by 8 */ 01897 add edi, 8 /* increase Dest register pointer by 8 */ 01898 dec ecx /* decrease loop counter */ 01899 jnz L11023 /* check loop termination, proceed if required */ 01900 emms /* exit MMX state */ 01901 popa 01902 } 01903 #else 01904 asm volatile 01905 ("pusha \n\t" 01906 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 01907 "mov %3, %%eax \n\t" /* load C into EAX */ 01908 "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ 01909 "mov %4, %%eax \n\t" /* load D into EAX */ 01910 "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */ 01911 "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */ 01912 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 01913 "mov %0, %%edi \n\t" /* load Dest address into edi */ 01914 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 01915 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 01916 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01917 "1: \n\t" 01918 "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ 01919 "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01920 "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ 01921 "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ 01922 "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ 01923 "dec %%ecx \n\t" /* decrease loop counter */ 01924 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01925 "emms \n\t" /* exit MMX state */ 01926 "popa \n\t":"=m" 
(Dest) /* %0 */ 01927 :"m"(Src1), /* %1 */ 01928 "m"(SrcLength), /* %2 */ 01929 "m"(C), /* %3 */ 01930 "m"(D) /* %4 */ 01931 ); 01932 #endif 01933 return (0); 01934 #else 01935 return (-1); 01936 #endif 01937 } 01938 01949 int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C) 01950 { 01951 unsigned int i, j, istart, D; 01952 int iC[4]; 01953 unsigned char *cursrc1; 01954 unsigned char *curdest; 01955 int result; 01956 01957 /* Validate input parameters */ 01958 if ((Src1 == NULL) || (Dest == NULL)) 01959 return(-1); 01960 if (length == 0) 01961 return(0); 01962 01963 /* Special case: C==0 */ 01964 if (C == 0) { 01965 memcpy(Src1, Dest, length); 01966 return (0); 01967 } 01968 01969 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01970 01971 /* MMX routine */ 01972 D=SWAP_32(C); 01973 SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D); 01974 01975 /* Check for unaligned bytes */ 01976 if ((length & 7) > 0) { 01977 /* Setup to process unaligned bytes */ 01978 istart = length & 0xfffffff8; 01979 cursrc1 = &Src1[istart]; 01980 curdest = &Dest[istart]; 01981 } else { 01982 /* No unaligned bytes - we are done */ 01983 return (0); 01984 } 01985 } else { 01986 /* Setup to process whole image */ 01987 istart = 0; 01988 cursrc1 = Src1; 01989 curdest = Dest; 01990 } 01991 01992 /* C routine to process bytes */ 01993 iC[3] = (int) ((C >> 24) & 0xff); 01994 iC[2] = (int) ((C >> 16) & 0xff); 01995 iC[1] = (int) ((C >> 8) & 0xff); 01996 iC[0] = (int) ((C >> 0) & 0xff); 01997 for (i = istart; i < length; i += 4) { 01998 for (j = 0; j < 4; j++) { 01999 if ((i+j)<length) { 02000 result = (int) *cursrc1 + iC[j]; 02001 if (result > 255) result = 255; 02002 *curdest = (unsigned char) result; 02003 /* Advance pointers */ 02004 cursrc1++; 02005 curdest++; 02006 } 02007 } 02008 } 02009 return (0); 02010 } 02011 02023 int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char 
C, 02024 unsigned char *Mask) 02025 { 02026 #ifdef USE_MMX 02027 #if !defined(GCC__) 02028 __asm 02029 { 02030 pusha 02031 /* ** Duplicate C in 8 bytes of MM1 ** */ 02032 mov al, C /* load C into AL */ 02033 mov ah, al /* copy AL into AH */ 02034 mov bx, ax /* copy AX into BX */ 02035 shl eax, 16 /* shift 2 bytes of EAX left */ 02036 mov ax, bx /* copy BX into AX */ 02037 movd mm1, eax /* copy EAX into MM1 */ 02038 movd mm2, eax /* copy EAX into MM2 */ 02039 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02040 mov edx, Mask /* load Mask address into edx */ 02041 movq mm0, [edx] /* load Mask into mm0 */ 02042 mov eax, Src1 /* load Src1 address into eax */ 02043 mov edi, Dest /* load Dest address into edi */ 02044 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02045 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02046 align 16 /* 16 byte alignment of the loop entry */ 02047 L1022: 02048 movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */ 02049 psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */ 02050 pand mm2, mm0 // apply Mask to 8 BYTES of MM2 */ 02051 /* byte 0x0f, 0xdb, 0xd0 */ 02052 paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */ 02053 movq [edi], mm2 /* store result in Dest */ 02054 add eax, 8 /* increase Src1 register pointer by 8 */ 02055 add edi, 8 /* increase Dest register pointer by 8 */ 02056 dec ecx /* decrease loop counter */ 02057 jnz L1022 /* check loop termination, proceed if required */ 02058 emms /* exit MMX state */ 02059 popa 02060 } 02061 #else 02062 asm volatile 02063 ("pusha \n\t" 02064 /* ** Duplicate C in 8 bytes of MM1 ** */ 02065 "mov %3, %%al \n\t" /* load C into AL */ 02066 "mov %%al, %%ah \n\t" /* copy AL into AH */ 02067 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 02068 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 02069 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 02070 "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ 02071 "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 
*/ 02072 "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */ 02073 "movl %4, %%edx \n\t" /* load Mask address into edx */ 02074 "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ 02075 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 02076 "mov %0, %%edi \n\t" /* load Dest address into edi */ 02077 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 02078 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 02079 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 02080 "1: \n\t" 02081 "movq (%%eax), %%mm2 \n\t" /* load 8 bytes from Src1 into MM2 */ 02082 "psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of MM2 1 bit to the right */ 02083 /* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 */ 02084 ".byte 0x0f, 0xdb, 0xd0 \n\t" 02085 "paddusb %%mm1, %%mm2 \n\t" /* MM2=SrcDest+C (add 8 bytes with saturation) */ 02086 "movq %%mm2, (%%edi) \n\t" /* store result in Dest */ 02087 "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ 02088 "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ 02089 "dec %%ecx \n\t" /* decrease loop counter */ 02090 "jnz 1b \n\t" /* check loop termination, proceed if required */ 02091 "emms \n\t" /* exit MMX state */ 02092 "popa \n\t":"=m" (Dest) /* %0 */ 02093 :"m"(Src1), /* %1 */ 02094 "m"(SrcLength), /* %2 */ 02095 "m"(C), /* %3 */ 02096 "m"(Mask) /* %4 */ 02097 ); 02098 #endif 02099 return (0); 02100 #else 02101 return (-1); 02102 #endif 02103 } 02104 02115 int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02116 { 02117 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 02118 unsigned int i, istart; 02119 int iC; 02120 unsigned char *cursrc1; 02121 unsigned char *curdest; 02122 int result; 02123 02124 /* Validate input parameters */ 02125 if ((Src1 == NULL) || (Dest == NULL)) 02126 return(-1); 02127 if (length == 0) 02128 return(0); 02129 02130 if 
/*!
\brief Internal MMX implementation of filter using SubByte: D = saturation0(S - C)

Processes SrcLength/8 groups of 8 bytes; the trailing (SrcLength & 7) bytes
are not touched. The loop decrements its counter before testing it, so
SrcLength must be at least 8 (a count of zero would wrap around instead of
ending the loop).

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Duplicate C in 8 bytes of MM1 ** */
		mov al, C	/* load C into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm1, eax	/* copy EAX into MM1 */
		movd mm2, eax	/* copy EAX into MM2 */
		punpckldq mm1, mm2	/* fill higher bytes of MM1 with C */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1023:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0, mm1	/* MM0=Src1-C (sub 8 bytes with saturation) */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1023	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		/* ** Duplicate C in 8 bytes of MM1 ** */
		"mov %3, %%al \n\t"	/* load C into AL */
		"mov %%al, %%ah \n\t"	/* copy AL into AH */
		"mov %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with C */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psubusb %%mm1, %%mm0 \n\t"	/* MM0=Src1-C (sub 8 bytes with saturation) */
		"movq %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C)			/* %3 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using SubByte: D = saturation0(S - C)

When MMX is available and at least 8 bytes are given, the MMX routine
processes all complete 8-byte groups and the C loop finishes the remaining
(length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		SDL_imageFilterSubByteMMX(Src1, Dest, length, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) *cursrc1 - iC;
		if (result < 0)
			result = 0;	/* saturate at 0 */
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}
	return (0);
}
/*!
\brief Internal MMX implementation of filter using SubUint: per-byte saturating subtraction of a 64-bit pattern built from C and D

MM1 is loaded with C in its low 32 bits and D in its high 32 bits; every
8-byte group of the source has MM1 subtracted byte-wise with unsigned
saturation. Processes SrcLength/8 groups; the trailing (SrcLength & 7)
bytes are not touched. The loop decrements its counter before testing it,
so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant subtracted from bytes 0-3 of each 8-byte group.
\param D Constant subtracted from bytes 4-7 of each 8-byte group.

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Build the 8-byte subtrahend in MM1: low dword C, high dword D ** */
		mov eax, C	/* load C into EAX */
		movd mm1, eax	/* copy EAX into MM1 */
		mov eax, D	/* load D into EAX */
		movd mm2, eax	/* copy EAX into MM2 */
		punpckldq mm1, mm2	/* fill higher bytes of MM1 with D (from MM2) */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L11024:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0, mm1	/* MM0=Src1-MM1 (sub 8 bytes with saturation) */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L11024	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		/* ** Build the 8-byte subtrahend in MM1: low dword C, high dword D ** */
		"mov %3, %%eax \n\t"	/* load C into EAX */
		"movd %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"mov %4, %%eax \n\t"	/* load D into EAX */
		"movd %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with D (from MM2) */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psubusb %%mm1, %%mm0 \n\t"	/* MM0=Src1-MM1 (sub 8 bytes with saturation) */
		"movq %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C),			/* %3 */
		"m"(D)			/* %4 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using SubUint: per-byte saturating subtraction of the bytes of a 32-bit constant

In the C path, byte k (k = 0..3) of every 4-byte group has ((C >> 8k) & 0xff)
subtracted, saturating at 0. When MMX is available and at least 8 bytes are
given, the MMX routine processes all complete 8-byte groups (called with C
and SWAP_32(C)) and the C loop finishes the remaining (length & 7) bytes.

NOTE(review): the MMX path subtracts SWAP_32(C) from bytes 4-7 of each
8-byte group, whereas the C path repeats the same 4-byte pattern; the two
paths look inconsistent - confirm the intended semantics.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant whose four bytes are subtracted (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, istart, D;
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		D = SWAP_32(C);
		SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
	}

	/* C routine to process image */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >> 8) & 0xff);
	iC[0] = (int) ((C >> 0) & 0xff);
	for (i = istart; i < length; i++) {
		/* istart is 0 or a multiple of 8, so (i & 3) is the position within the 4-byte group */
		result = (int) Src1[i] - iC[i & 3];
		if (result < 0)
			result = 0;	/* saturate at 0 */
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
/*!
\brief Internal MMX implementation of filter using ShiftRight: D = saturation0(S >> N)

The data is shifted as 16-bit words (psrlw), which lets bits of the upper
byte of each word slide into the lower byte. A per-byte bit mask is
therefore built first (all ones, shifted right one bit and ANDed with Mask,
N times) and applied after every shift.

The mask-preparation loop and the data loop both decrement their counter
before testing it: N must be at least 1 and SrcLength at least 8.
Processes SrcLength/8 groups of 8 bytes; the trailing (SrcLength & 7) bytes
are not touched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).
\param Mask Pointer to 8 mask bytes, read with a single 64-bit load.

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
								 unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Mask	/* load Mask address into edx */
		movq mm0, [edx]	/* load Mask into mm0 */
		xor ecx, ecx	/* zero ECX */
		mov cl, N	/* load loop counter (N) into CL */
		movd mm3, ecx	/* copy (N) into MM3 */
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
L10240:	/* ** Prepare proper bit-Mask in MM1 ** */
		psrlw mm1, 1	/* shift 4 WORDS of MM1 1 bit to the right */
		pand mm1, mm0	/* apply Mask to 8 BYTES of MM1 */
		/* byte 0x0f, 0xdb, 0xc8 */
		dec cl	/* decrease loop counter */
		jnz L10240	/* check loop termination, proceed if required */
		/* ** Shift all bytes of the image ** */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L10241:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		psrlw mm0, mm3	/* shift 4 WORDS of MM0 (N) bits to the right */
		pand mm0, mm1	/* apply proper bit-Mask to 8 BYTES of MM0 */
		/* byte 0x0f, 0xdb, 0xc1 */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L10241	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		"movl %4, %%edx \n\t"	/* load Mask address into edx */
		"movq (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
		"xor %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov %3, %%cl \n\t"	/* load loop counter (N) into CL */
		"movd %%ecx, %%mm3 \n\t"	/* copy (N) into MM3 */
		"pcmpeqb %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"1: \n\t"	/* ** Prepare proper bit-Mask in MM1 ** */
		"psrlw $1, %%mm1 \n\t"	/* shift 4 WORDS of MM1 1 bit to the right */
		/* hand-encoded "pand %mm0, %mm1": apply Mask to 8 BYTES of MM1 */
		".byte 0x0f, 0xdb, 0xc8 \n\t"
		"dec %%cl \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		/* ** Shift all bytes of the image ** */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"2: \n\t"
		"movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psrlw %%mm3, %%mm0 \n\t"	/* shift 4 WORDS of MM0 (N) bits to the right */
		/* hand-encoded "pand %mm1, %mm0": apply proper bit-Mask to 8 BYTES of MM0 */
		".byte 0x0f, 0xdb, 0xc1 \n\t"
		"movq %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 2b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(Mask)		/* %4 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using ShiftRight: D = saturation0(S >> N)

When MMX is available and at least 8 bytes are given, the MMX routine
processes all complete 8-byte groups and the C loop finishes the remaining
(length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 if Src1 or Dest is NULL or N > 8.
*/
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	/* 0x7F in every byte: clears the bit that leaks in from the neighbouring byte after a 1-bit word shift */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		/* the byte is promoted to int before shifting, so N == 8 is well defined and yields 0 */
		*curdest = (unsigned char) *cursrc1 >> N;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
/*!
\brief Internal MMX implementation of filter using ShiftRightUint: D = saturation0((uint)S >> N)

Each 8-byte group is loaded into an MMX register and its two 32-bit halves
are shifted right by N with psrld. Processes SrcLength/8 groups; the
trailing (SrcLength & 7) bytes are not touched. The loop decrements its
counter before testing it, so SrcLength must be at least 8.

NOTE(review): N is passed to psrld as a memory operand although it is a
single byte; the instruction's memory form takes a 64-bit count, so the
bytes following N may influence the shift - confirm.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L13023:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		psrld mm0, N	/* shift 2 DWORDS of MM0 (N) bits to the right */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L13023	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psrld %3, %%mm0 \n\t"	/* shift 2 DWORDS of MM0 (N) bits to the right */
		"movq %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N)			/* %3 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

The buffers are treated as arrays of native-endian unsigned int words
(assumed to be 4 bytes wide, as the original 4-byte stride implies). When
MMX is available and at least 8 bytes are given, the MMX routine processes
all complete 8-byte groups and the C loop finishes the remainder. Trailing
bytes that do not form a complete word are left untouched in Dest.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 if Src1 or Dest is NULL or N > 32.
*/
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
	}

	/* C routine to process image: every COMPLETE word, including the last
	   one (the comparison is written as a subtraction so it cannot wrap). */
	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
		/* memcpy keeps the access valid for unaligned byte buffers */
		memcpy(&word, &Src1[i], sizeof(word));
		/* shifting a 32-bit value by 32 is undefined in C; the result is defined as 0 here */
		word = (N < 32) ? (word >> N) : 0;
		memcpy(&Dest[i], &word, sizeof(word));
	}

	return (0);
}
/*!
\brief Internal MMX implementation of filter using MultByByte: D = saturation255(S * C)

Each 8-byte group is unpacked into two sets of four 16-bit words, every
word is multiplied by C (low 16 bits of the product, pmullw) and the words
are packed back to bytes with unsigned saturation. Two loop variants exist:
a plain one and one that takes the absolute value of the signed 16-bit
products before packing.

Processes SrcLength/8 groups of 8 bytes; the trailing (SrcLength & 7) bytes
are not touched. Both loops decrement their counter before testing it, so
SrcLength must be at least 8.

NOTE(review): the variant is selected with "cmp al, 128" AFTER EAX has been
reloaded with the Src1 address, so AL holds the low byte of that address
rather than C - the branch does not test C as its comment says; confirm.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to multiply with (C).

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Duplicate C in 4 words of MM1 ** */
		mov al, C	/* load C into AL */
		xor ah, ah	/* zero AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm1, eax	/* copy EAX into MM1 */
		movd mm2, eax	/* copy EAX into MM2 */
		punpckldq mm1, mm2	/* fill higher words of MM1 with C */
		pxor mm0, mm0	/* zero MM0 register */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		cmp al, 128	/* if (C <= 128) execute more efficient code - but see NOTE(review) in the header: AL no longer holds C */
		jg L10251
		align 16	/* 16 byte alignment of the loop entry */
L10250:
		movq mm3, [eax]	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3	/* copy MM3 into MM4 */
		punpcklbw mm3, mm0	/* unpack low bytes of Src1 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src1 into words */
		pmullw mm3, mm1	/* mul low bytes of Src1 and MM1 */
		pmullw mm4, mm1	/* mul high bytes of Src1 and MM1 */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L10250	/* check loop termination, proceed if required */
		jmp L10252
		align 16	/* 16 byte alignment of the loop entry */
L10251:
		movq mm3, [eax]	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3	/* copy MM3 into MM4 */
		punpcklbw mm3, mm0	/* unpack low bytes of Src1 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src1 into words */
		pmullw mm3, mm1	/* mul low bytes of Src1 and MM1 */
		pmullw mm4, mm1	/* mul high bytes of Src1 and MM1 */
		/* ** Take abs value of the results (signed words) ** */
		movq mm5, mm3	/* copy mm3 into mm5 */
		movq mm6, mm4	/* copy mm4 into mm6 */
		psraw mm5, 15	/* fill mm5 words with word sign bit */
		psraw mm6, 15	/* fill mm6 words with word sign bit */
		pxor mm3, mm5	/* take 1's complement of only neg words */
		pxor mm4, mm6	/* take 1's complement of only neg words */
		psubsw mm3, mm5	/* add 1 to only neg words, W-(-1) or W-0 */
		psubsw mm4, mm6	/* add 1 to only neg words, W-(-1) or W-0 */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L10251	/* check loop termination, proceed if required */
L10252:
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		/* ** Duplicate C in 4 words of MM1 ** */
		"mov %3, %%al \n\t"	/* load C into AL */
		"xor %%ah, %%ah \n\t"	/* zero AH */
		"mov %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher words of MM1 with C */
		"pxor %%mm0, %%mm0 \n\t"	/* zero MM0 register */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		"cmp $128, %%al \n\t"	/* if (C <= 128) execute more efficient code - but see NOTE(review) in the header: AL no longer holds C */
		"jg 2f \n\t"
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4 */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src1 into words */
		"pmullw %%mm1, %%mm3 \n\t"	/* mul low bytes of Src1 and MM1 */
		"pmullw %%mm1, %%mm4 \n\t"	/* mul high bytes of Src1 and MM1 */
		"packuswb %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		"jmp 3f \n\t"
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"2: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4 */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src1 into words */
		"pmullw %%mm1, %%mm3 \n\t"	/* mul low bytes of Src1 and MM1 */
		"pmullw %%mm1, %%mm4 \n\t"	/* mul high bytes of Src1 and MM1 */
		/* ** Take abs value of the results (signed words) ** */
		"movq %%mm3, %%mm5 \n\t"	/* copy mm3 into mm5 */
		"movq %%mm4, %%mm6 \n\t"	/* copy mm4 into mm6 */
		"psraw $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
		"psraw $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
		"pxor %%mm5, %%mm3 \n\t"	/* take 1's complement of only neg. words */
		"pxor %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
		"psubsw %%mm5, %%mm3 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"psubsw %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"packuswb %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 2b \n\t"	/* check loop termination, proceed if required */
		"3: emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C)			/* %3 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using MultByByte: D = saturation255(S * C)

When MMX is available and at least 8 bytes are given, the MMX routine
processes all complete 8-byte groups and the C loop finishes the remaining
(length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==1 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if (C == 1) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) *cursrc1 * iC;
		if (result > 255)
			result = 255;	/* saturate at 255 */
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
/*!
\brief Internal MMX implementation of filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)

Each 8-byte group is unpacked into two sets of four 16-bit words, every
word is shifted right by N, multiplied by C (low 16 bits of the product,
pmullw) and packed back to bytes with unsigned saturation.

Processes SrcLength/8 groups of 8 bytes; the trailing (SrcLength & 7) bytes
are not touched. The loop decrements its counter before testing it, so
SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).
\param C Constant to multiply with (C).

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
											  unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Duplicate C in 4 words of MM1 ** */
		mov al, C	/* load C into AL */
		xor ah, ah	/* zero AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm1, eax	/* copy EAX into MM1 */
		movd mm2, eax	/* copy EAX into MM2 */
		punpckldq mm1, mm2	/* fill higher words of MM1 with C */
		xor ecx, ecx	/* zero ECX */
		mov cl, N	/* load N into CL */
		movd mm7, ecx	/* copy N into MM7 */
		pxor mm0, mm0	/* zero MM0 register */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1026:
		movq mm3, [eax]	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3	/* copy MM3 into MM4 */
		punpcklbw mm3, mm0	/* unpack low bytes of Src1 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src1 into words */
		psrlw mm3, mm7	/* shift 4 WORDS of MM3 (N) bits to the right */
		psrlw mm4, mm7	/* shift 4 WORDS of MM4 (N) bits to the right */
		pmullw mm3, mm1	/* mul low bytes of Src1 by MM1 */
		pmullw mm4, mm1	/* mul high bytes of Src1 by MM1 */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1026	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		/* ** Duplicate C in 4 words of MM1 ** */
		"mov %4, %%al \n\t"	/* load C into AL */
		"xor %%ah, %%ah \n\t"	/* zero AH */
		"mov %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher words of MM1 with C */
		"xor %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov %3, %%cl \n\t"	/* load N into CL */
		"movd %%ecx, %%mm7 \n\t"	/* copy N into MM7 */
		"pxor %%mm0, %%mm0 \n\t"	/* zero MM0 register */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4 */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src1 into words */
		"psrlw %%mm7, %%mm3 \n\t"	/* shift 4 WORDS of MM3 (N) bits to the right */
		"psrlw %%mm7, %%mm4 \n\t"	/* shift 4 WORDS of MM4 (N) bits to the right */
		"pmullw %%mm1, %%mm3 \n\t"	/* mul low bytes of Src1 by MM1 */
		"pmullw %%mm1, %%mm4 \n\t"	/* mul high bytes of Src1 by MM1 */
		"packuswb %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(C)			/* %4 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*!
\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)

When MMX is available and at least 8 bytes are given, the MMX routine
processes all complete 8-byte groups and the C loop finishes the remaining
(length & 7) bytes; otherwise the C loop processes everything.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL or N > 8.
*/
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
										   unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 && C==1 leaves every byte unchanged, so copy S to D.
	   The destination is the FIRST argument; memmove also tolerates
	   in-place use (Src1 == Dest) and overlapping buffers. */
	if ((N == 0) && (C == 1)) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine handles all complete 8-byte groups */
		SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) (*cursrc1 >> N) * iC;
		if (result > 255)
			result = 255;	/* saturate at 255 */
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
/*!
\brief Internal MMX implementation of filter using ShiftLeftByte: D = (S << N)

The data is shifted as 16-bit words (psllw), which lets bits of the lower
byte of each word slide into the upper byte. A per-byte bit mask is
therefore built first (all ones, shifted left one bit and ANDed with Mask,
N times) and applied after every shift.

The mask-preparation loop and the data loop both decrement their counter
before testing it: N must be at least 1 and SrcLength at least 8.
Processes SrcLength/8 groups of 8 bytes; the trailing (SrcLength & 7) bytes
are not touched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).
\param Mask Pointer to 8 mask bytes, read with a single 64-bit load.

\return Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
*/
int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
									unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Mask	/* load Mask address into edx */
		movq mm0, [edx]	/* load Mask into mm0 */
		xor ecx, ecx	/* zero ECX */
		mov cl, N	/* load loop counter (N) into CL */
		movd mm3, ecx	/* copy (N) into MM3 */
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
L10270:	/* ** Prepare proper bit-Mask in MM1 ** */
		psllw mm1, 1	/* shift 4 WORDS of MM1 1 bit to the left */
		pand mm1, mm0	/* apply Mask to 8 BYTES of MM1 */
		/* byte 0x0f, 0xdb, 0xc8 */
		dec cl	/* decrease loop counter */
		jnz L10270	/* check loop termination, proceed if required */
		/* ** Shift all bytes of the image ** */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L10271:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		psllw mm0, mm3	/* shift 4 WORDS of MM0 (N) bits to the left */
		pand mm0, mm1	/* apply proper bit-Mask to 8 BYTES of MM0 */
		/* byte 0x0f, 0xdb, 0xc1 */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L10271	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha \n\t"
		"movl %4, %%edx \n\t"	/* load Mask address into edx */
		"movq (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
		"xor %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov %3, %%cl \n\t"	/* load loop counter (N) into CL */
		"movd %%ecx, %%mm3 \n\t"	/* copy (N) into MM3 */
		"pcmpeqb %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"1: \n\t"	/* ** Prepare proper bit-Mask in MM1 ** */
		"psllw $1, %%mm1 \n\t"	/* shift 4 WORDS of MM1 1 bit to the left */
		/* hand-encoded "pand %mm0, %mm1": apply Mask to 8 BYTES of MM1 */
		".byte 0x0f, 0xdb, 0xc8 \n\t"
		"dec %%cl \n\t"	/* decrease loop counter */
		"jnz 1b \n\t"	/* check loop termination, proceed if required */
		/* ** Shift all bytes of the image ** */
		"mov %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16 \n\t"	/* 16 byte alignment of the loop entry */
		"2: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psllw %%mm3, %%mm0 \n\t"	/* shift 4 WORDS of MM0 (N) bits to the left */
		/* hand-encoded "pand %mm1, %mm0": apply proper bit-Mask to 8 BYTES of MM0 */
		".byte 0x0f, 0xdb, 0xc1 \n\t"
		"movq %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec %%ecx \n\t"	/* decrease loop counter */
		"jnz 2b \n\t"	/* check loop termination, proceed if required */
		"emms \n\t"	/* exit MMX state */
		"popa \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(Mask)		/* %4 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
/*
 * Filter using ShiftLeftByte: D = (S << N), each byte shifted on its own
 * with the bits that leave the byte discarded (no saturation).
 *
 * Src1   - source byte array (S).
 * Dest   - destination byte array (D).
 * length - number of bytes to process.
 * N      - left shift applied to every byte; must be in 0..8.
 *
 * Returns 0 on success (including length == 0) and -1 when a pointer is
 * NULL or N is out of range.
 */
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
    /* Per-byte mask handed to the MMX routine, which uses it to build the
       bit mask that is applied after the word-wise shift. */
    static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
    unsigned int i, istart;
    unsigned char *cursrc1, *curdest;
    int result;

    /* Validate input parameters */
    if ((Src1 == NULL) || (Dest == NULL))
        return (-1);
    if (length == 0)
        return (0);

    if (N > 8) {
        return (-1);
    }

    /* Special case: N==0 is the identity, so just copy S to D.
       The destination is the FIRST argument; memmove keeps the copy
       well defined even if the caller passes overlapping buffers. */
    if (N == 0) {
        memmove(Dest, Src1, length);
        return (0);
    }

    if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

        /* The MMX routine handles whole 8-byte groups only */
        SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);

        /* Check for unaligned bytes */
        if ((length & 7) > 0) {
            /* Setup to process the remaining (length % 8) bytes in C */
            istart = length & 0xfffffff8;
            cursrc1 = &Src1[istart];
            curdest = &Dest[istart];
        } else {
            /* No unaligned bytes - we are done */
            return (0);
        }
    } else {
        /* Setup to process whole image */
        istart = 0;
        cursrc1 = Src1;
        curdest = Dest;
    }

    /* C routine to process image */
    for (i = istart; i < length; i++) {
        /* Keep only the low 8 bits of the shifted value */
        result = ((int) *cursrc1 << N) & 0xff;
        *curdest = (unsigned char) result;
        /* Advance pointers */
        cursrc1++;
        curdest++;
    }

    return (0);
}
/*
 * Filter using ShiftLeftUint: D = (S << N), where the byte arrays are
 * processed as a sequence of unsigned int words in native byte order
 * (the word size is assumed to be 4 bytes, as in the original code).
 *
 * Src1   - source byte array (S).
 * Dest   - destination byte array (D).
 * length - number of bytes; only complete words are processed, a trailing
 *          partial word is left unwritten in Dest.
 * N      - left shift applied to every word; must be in 0..32.
 *
 * Returns 0 on success (including length == 0) and -1 when a pointer is
 * NULL or N is out of range.
 */
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
    const unsigned int wordsize = (unsigned int) sizeof(unsigned int);
    unsigned int i, istart;
    unsigned char *cursrc1, *curdest;
    unsigned int word;

    /* Validate input parameters */
    if ((Src1 == NULL) || (Dest == NULL))
        return (-1);
    if (length == 0)
        return (0);

    if (N > 32) {
        return (-1);
    }

    /* Special case: N==0 is the identity, so just copy S to D.
       The destination is the FIRST argument; memmove keeps the copy
       well defined even if the caller passes overlapping buffers. */
    if (N == 0) {
        memmove(Dest, Src1, length);
        return (0);
    }

    if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

        /* The MMX routine handles whole 8-byte groups only */
        SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);

        /* Check for unaligned bytes */
        if ((length & 7) > 0) {
            /* Setup to process the remaining (length % 8) bytes in C */
            istart = length & 0xfffffff8;
            cursrc1 = &Src1[istart];
            curdest = &Dest[istart];
        } else {
            /* No unaligned bytes - we are done */
            return (0);
        }
    } else {
        /* Setup to process whole image */
        istart = 0;
        cursrc1 = Src1;
        curdest = Dest;
    }

    /* C routine to process image.
       i never exceeds length here, so (length - i) cannot wrap; the test
       admits the final word when exactly one word's worth of bytes remain. */
    for (i = istart; (length - i) >= wordsize; i += wordsize) {
        /* memcpy instead of an unsigned int* cast: the byte buffers carry
           no alignment guarantee and must not be aliased as words. */
        memcpy(&word, cursrc1, sizeof(word));
        /* Shifting by the full width is undefined in C; all bits leave
           the word, so the result is zero. */
        word = (N < 32) ? (word << N) : 0u;
        memcpy(curdest, &word, sizeof(word));
        /* Advance pointers */
        cursrc1 += wordsize;
        curdest += wordsize;
    }

    return (0);
}
shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03449 cmp al, 7 /* if (N <= 7) execute more efficient code */ 03450 jg L10281 03451 align 16 /* 16 byte alignment of the loop entry */ 03452 L10280: 03453 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03454 movq mm4, mm3 /* copy MM3 into MM4 */ 03455 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03456 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03457 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */ 03458 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */ 03459 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03460 movq [edi], mm3 /* store result in Dest */ 03461 add eax, 8 /* increase Src1 register pointer by 8 */ 03462 add edi, 8 /* increase Dest register pointer by 8 */ 03463 dec ecx /* decrease loop counter */ 03464 jnz L10280 /* check loop termination, proceed if required */ 03465 jmp L10282 03466 align 16 /* 16 byte alignment of the loop entry */ 03467 L10281: 03468 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03469 movq mm4, mm3 /* copy MM3 into MM4 */ 03470 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03471 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03472 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */ 03473 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */ 03474 /* ** Take abs value of the signed words ** */ 03475 movq mm5, mm3 /* copy mm3 into mm5 */ 03476 movq mm6, mm4 /* copy mm4 into mm6 */ 03477 psraw mm5, 15 /* fill mm5 words with word sign bit */ 03478 psraw mm6, 15 /* fill mm6 words with word sign bit */ 03479 pxor mm3, mm5 /* take 1's compliment of only neg words */ 03480 pxor mm4, mm6 /* take 1's compliment of only neg words */ 03481 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 03482 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 03483 packuswb mm3, mm4 /* pack words back into bytes with 
saturation */ 03484 movq [edi], mm3 /* store result in Dest */ 03485 add eax, 8 /* increase Src1 register pointer by 8 */ 03486 add edi, 8 /* increase Dest register pointer by 8 */ 03487 dec ecx /* decrease loop counter */ 03488 jnz L10281 /* check loop termination, proceed if required */ 03489 L10282: 03490 emms /* exit MMX state */ 03491 popa 03492 } 03493 #else 03494 asm volatile 03495 ("pusha \n\t" "xor %%eax, %%eax \n\t" /* zero EAX */ 03496 "mov %3, %%al \n\t" /* load N into AL */ 03497 "movd %%eax, %%mm7 \n\t" /* copy N into MM7 */ 03498 "pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */ 03499 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 03500 "mov %0, %%edi \n\t" /* load Dest address into edi */ 03501 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 03502 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 03503 "cmp $7, %%al \n\t" /* if (N <= 7) execute more efficient code */ 03504 "jg 2f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 03505 "1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ 03506 "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ 03507 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ 03508 "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ 03509 "psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */ 03510 "psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */ 03511 "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ 03512 "movq %%mm3, (%%edi) \n\t" /* store result in Dest */ 03513 "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ 03514 "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ 03515 "dec %%ecx \n\t" /* decrease loop counter */ 03516 "jnz 1b \n\t" /* check loop termination, proceed if required */ 03517 "jmp 3f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 03518 "2: movq (%%eax), %%mm3 \n\t" /* load 8 bytes 
/*
 * Filter using ShiftLeft: D = saturation255(S << N)
 *
 * Src1   - source byte array (S).
 * Dest   - destination byte array (D).
 * length - number of bytes to process.
 * N      - left shift applied to every byte; must be in 0..8.
 *
 * Returns 0 on success (including length == 0) and -1 when a pointer is
 * NULL or N is out of range.
 */
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
    unsigned int i, istart;
    unsigned char *cursrc1, *curdest;
    int result;

    /* Validate input parameters */
    if ((Src1 == NULL) || (Dest == NULL))
        return (-1);
    if (length == 0)
        return (0);

    if (N > 8) {
        return (-1);
    }

    /* Special case: N==0 is the identity, so just copy S to D.
       The destination is the FIRST argument; memmove keeps the copy
       well defined even if the caller passes overlapping buffers. */
    if (N == 0) {
        memmove(Dest, Src1, length);
        return (0);
    }

    if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

        /* The MMX routine handles whole 8-byte groups only */
        SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);

        /* Check for unaligned bytes */
        if ((length & 7) > 0) {
            /* Setup to process the remaining (length % 8) bytes in C */
            istart = length & 0xfffffff8;
            cursrc1 = &Src1[istart];
            curdest = &Dest[istart];
        } else {
            /* No unaligned bytes - we are done */
            return (0);
        }
    } else {
        /* Setup to process whole image */
        istart = 0;
        cursrc1 = Src1;
        curdest = Dest;
    }

    /* C routine to process image */
    for (i = istart; i < length; i++) {
        /* The byte is promoted to int, so the shifted value cannot overflow */
        result = (int) *cursrc1 << N;
        if (result > 255)
            result = 255;
        *curdest = (unsigned char) result;
        /* Advance pointers */
        cursrc1++;
        curdest++;
    }

    return (0);
}
loop counter (SIZE) into ecx */ 03651 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03652 align 16 /* 16 byte alignment of the loop entry */ 03653 L1029: 03654 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 03655 paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */ 03656 pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */ 03657 movq [edi], mm0 /* store result in SrcDest */ 03658 add eax, 8 /* increase Src1 register pointer by 8 */ 03659 add edi, 8 /* increase Dest register pointer by 8 */ 03660 dec ecx /* decrease loop counter */ 03661 jnz L1029 /* check loop termination, proceed if required */ 03662 emms /* exit MMX state */ 03663 popa 03664 } 03665 #else 03666 asm volatile 03667 ("pusha \n\t" 03668 /* ** Duplicate T in 8 bytes of MM3 ** */ 03669 "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ 03670 "pcmpeqb %%mm2, %%mm2 \n\t" /* generate all 1's in mm2 */ 03671 "mov %3, %%al \n\t" /* load T into AL */ 03672 "mov %%al, %%ah \n\t" /* copy AL into AH */ 03673 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 03674 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 03675 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 03676 "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */ 03677 "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */ 03678 "punpckldq %%mm4, %%mm3 \n\t" /* fill higher bytes of MM3 with T */ 03679 "psubusb %%mm3, %%mm2 \n\t" /* store 0xFF - T in MM2 */ 03680 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 03681 "mov %0, %%edi \n\t" /* load Dest address into edi */ 03682 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 03683 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 03684 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 03685 "1: \n\t" 03686 "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ 03687 "paddusb %%mm2, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */ 03688 "pcmpeqb %%mm1, %%mm0 \n\t" /* binarize 255:0, comparing 
/*
 * Filter using BinarizeUsingThreshold: D = (S >= T) ? 255 : 0
 *
 * Src1   - source byte array (S).
 * Dest   - destination byte array (D).
 * length - number of bytes to process.
 * T      - threshold; bytes greater than or equal to T become 255,
 *          all others become 0.
 *
 * Returns 0 on success (including length == 0) and -1 when a pointer is
 * NULL.
 */
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
    unsigned int pos;
    unsigned int first = 0;     /* index of the first byte handled by the C loop */

    /* Reject missing buffers; an empty image is trivially done */
    if ((Src1 == NULL) || (Dest == NULL)) {
        return (-1);
    }
    if (length == 0) {
        return (0);
    }

    /* With T == 0 every byte passes the threshold */
    if (T == 0) {
        memset(Dest, 255, length);
        return (0);
    }

    if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
        /* Let the MMX routine take all complete 8-byte groups */
        SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);

        if ((length & 7) == 0) {
            /* Nothing left over for the C loop */
            return (0);
        }
        /* Continue with the (length % 8) trailing bytes */
        first = length & 0xfffffff8;
    }

    /* Plain C path: whole image, or the tail left by the MMX routine */
    for (pos = first; pos < length; pos++) {
        Dest[pos] = (unsigned char) ((Src1[pos] >= T) ? 255 : 0);
    }

    return (0);
}
register pointer by 8 */ 03820 add edi, 8 /* increase Dest register pointer by 8 */ 03821 dec ecx /* decrease loop counter */ 03822 jnz L1030 /* check loop termination, proceed if required */ 03823 emms /* exit MMX state */ 03824 popa 03825 } 03826 #else 03827 asm volatile 03828 ("pusha \n\t" "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ 03829 /* ** Duplicate Tmax in 8 bytes of MM3 ** */ 03830 "mov %4, %%al \n\t" /* load Tmax into AL */ 03831 "mov %%al, %%ah \n\t" /* copy AL into AH */ 03832 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 03833 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 03834 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 03835 "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */ 03836 "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */ 03837 "punpckldq %%mm4, %%mm3 \n\t" /* fill higher bytes of MM3 with Tmax */ 03838 "psubusb %%mm3, %%mm1 \n\t" /* store 0xFF - Tmax in MM1 */ 03839 /* ** Duplicate Tmin in 8 bytes of MM5 ** */ 03840 "mov %3, %%al \n\t" /* load Tmin into AL */ 03841 "mov %%al, %%ah \n\t" /* copy AL into AH */ 03842 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 03843 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 03844 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 03845 "movd %%eax, %%mm5 \n\t" /* copy EAX into MM5 */ 03846 "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */ 03847 "punpckldq %%mm4, %%mm5 \n\t" /* fill higher bytes of MM5 with Tmin */ 03848 "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */ 03849 "paddusb %%mm1, %%mm7 \n\t" /* store 0xFF - Tmax + Tmin in MM7 */ 03850 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 03851 "mov %0, %%edi \n\t" /* load Dest address into edi */ 03852 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 03853 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 03854 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 03855 "1: \n\t" 03856 "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ 03857 "paddusb %%mm1, %%mm0 \n\t" /* 
/*
 * Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
 *
 * Src1   - source byte array (S).
 * Dest   - destination byte array (D).
 * length - number of bytes to process.
 * Tmin   - lower bound; smaller bytes are raised to Tmin.
 * Tmax   - upper bound; larger bytes are lowered to Tmax.
 *
 * Returns 0 on success (including length == 0) and -1 when a pointer is
 * NULL.
 */
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
                               unsigned char Tmax)
{
    unsigned int i, istart;
    unsigned char *cursrc1;
    unsigned char *curdest;

    /* Validate input parameters */
    if ((Src1 == NULL) || (Dest == NULL))
        return (-1);
    if (length == 0)
        return (0);

    /* Special case: Tmin==0 && Tmax==255 covers the full byte range, so
       nothing is clipped and S is simply copied to D.
       The destination is the FIRST argument; memmove keeps the copy
       well defined even if the caller passes overlapping buffers. */
    if ((Tmin == 0) && (Tmax == 255)) {
        memmove(Dest, Src1, length);
        return (0);
    }

    if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

        /* The MMX routine handles whole 8-byte groups only */
        SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);

        /* Check for unaligned bytes */
        if ((length & 7) > 0) {
            /* Setup to process the remaining (length % 8) bytes in C */
            istart = length & 0xfffffff8;
            cursrc1 = &Src1[istart];
            curdest = &Dest[istart];
        } else {
            /* No unaligned bytes - we are done */
            return (0);
        }
    } else {
        /* Setup to process whole image */
        istart = 0;
        cursrc1 = Src1;
        curdest = Dest;
    }

    /* C routine to process image */
    for (i = istart; i < length; i++) {
        if (*cursrc1 < Tmin) {
            *curdest = Tmin;
        } else if (*cursrc1 > Tmax) {
            *curdest = Tmax;
        } else {
            *curdest = *cursrc1;
        }
        /* Advance pointers */
        cursrc1++;
        curdest++;
    }

    return (0);
}
Nmin */ 04001 pxor mm7, mm7 /* zero MM7 register */ 04002 mov eax, Src1 /* load Src1 address into eax */ 04003 mov edi, Dest /* load Dest address into edi */ 04004 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 04005 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 04006 align 16 /* 16 byte alignment of the loop entry */ 04007 L1031: 04008 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 04009 movq mm4, mm3 /* copy MM3 into MM4 */ 04010 punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */ 04011 punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */ 04012 psubusb mm3, mm1 /* S-Cmin, low bytes */ 04013 psubusb mm4, mm1 /* S-Cmin, high bytes */ 04014 pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */ 04015 pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */ 04016 paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */ 04017 paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */ 04018 /* ** Take abs value of the signed words ** */ 04019 movq mm5, mm3 /* copy mm3 into mm5 */ 04020 movq mm6, mm4 /* copy mm4 into mm6 */ 04021 psraw mm5, 15 /* fill mm5 words with word sign bit */ 04022 psraw mm6, 15 /* fill mm6 words with word sign bit */ 04023 pxor mm3, mm5 /* take 1's compliment of only neg words */ 04024 pxor mm4, mm6 /* take 1's compliment of only neg words */ 04025 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 04026 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 04027 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 04028 movq [edi], mm3 /* store result in Dest */ 04029 add eax, 8 /* increase Src1 register pointer by 8 */ 04030 add edi, 8 /* increase Dest register pointer by 8 */ 04031 dec ecx /* decrease loop counter */ 04032 jnz L1031 /* check loop termination, proceed if required */ 04033 emms /* exit MMX state */ 04034 popa 04035 } 04036 #else 04037 asm volatile 04038 ("pusha \n\t" "mov %6, %%ax \n\t" /* load Nmax in AX */ 04039 "mov %4, %%bx \n\t" /* load Cmax in BX */ 04040 "sub %5, 
%%ax \n\t" /* AX = Nmax - Nmin */ 04041 "sub %3, %%bx \n\t" /* BX = Cmax - Cmin */ 04042 "jz 1f \n\t" /* check division by zero */ 04043 "xor %%dx, %%dx \n\t" /* prepare for division, zero DX */ 04044 "div %%bx \n\t" /* AX = AX/BX */ 04045 "jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */ 04046 "2: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */ 04047 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 04048 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 04049 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 04050 "movd %%eax, %%mm0 \n\t" /* copy EAX into MM0 */ 04051 "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ 04052 "punpckldq %%mm1, %%mm0 \n\t" /* fill higher words of MM0 with AX */ 04053 /* ** Duplicate Cmin in 4 words of MM1 ** */ 04054 "mov %3, %%ax \n\t" /* load Cmin into AX */ 04055 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 04056 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 04057 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 04058 "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ 04059 "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */ 04060 "punpckldq %%mm2, %%mm1 \n\t" /* fill higher words of MM1 with Cmin */ 04061 /* ** Duplicate Nmin in 4 words of MM2 ** */ 04062 "mov %5, %%ax \n\t" /* load Nmin into AX */ 04063 "mov %%ax, %%bx \n\t" /* copy AX into BX */ 04064 "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ 04065 "mov %%bx, %%ax \n\t" /* copy BX into AX */ 04066 "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */ 04067 "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */ 04068 "punpckldq %%mm3, %%mm2 \n\t" /* fill higher words of MM2 with Nmin */ 04069 "pxor %%mm7, %%mm7 \n\t" /* zero MM7 register */ 04070 "mov %1, %%eax \n\t" /* load Src1 address into eax */ 04071 "mov %0, %%edi \n\t" /* load Dest address into edi */ 04072 "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ 04073 "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ 04074 ".align 16 \n\t" /* 16 byte alignment of 
the loop entry */ 04075 "1: \n\t" 04076 "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ 04077 "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ 04078 "punpcklbw %%mm7, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ 04079 "punpckhbw %%mm7, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ 04080 "psubusb %%mm1, %%mm3 \n\t" /* S-Cmin, low bytes */ 04081 "psubusb %%mm1, %%mm4 \n\t" /* S-Cmin, high bytes */ 04082 "pmullw %%mm0, %%mm3 \n\t" /* MM0*(S-Cmin), low bytes */ 04083 "pmullw %%mm0, %%mm4 \n\t" /* MM0*(S-Cmin), high bytes */ 04084 "paddusb %%mm2, %%mm3 \n\t" /* MM0*(S-Cmin)+Nmin, low bytes */ 04085 "paddusb %%mm2, %%mm4 \n\t" /* MM0*(S-Cmin)+Nmin, high bytes */ 04086 /* ** Take abs value of the signed words ** */ 04087 "movq %%mm3, %%mm5 \n\t" /* copy mm3 into mm5 */ 04088 "movq %%mm4, %%mm6 \n\t" /* copy mm4 into mm6 */ 04089 "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */ 04090 "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */ 04091 "pxor %%mm5, %%mm3 \n\t" /* take 1's compliment of only neg. words */ 04092 "pxor %%mm6, %%mm4 \n\t" /* take 1's compliment of only neg. words */ 04093 "psubsw %%mm5, %%mm3 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */ 04094 "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. 
words, W-(-1) or W-0 */ 04095 "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ 04096 "movq %%mm3, (%%edi) \n\t" /* store result in Dest */ 04097 "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ 04098 "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ 04099 "dec %%ecx \n\t" /* decrease loop counter */ 04100 "jnz 1b \n\t" /* check loop termination, proceed if required */ 04101 "emms \n\t" /* exit MMX state */ 04102 "popa \n\t":"=m" (Dest) /* %0 */ 04103 :"m"(Src1), /* %1 */ 04104 "m"(SrcLength), /* %2 */ 04105 "m"(Cmin), /* %3 */ 04106 "m"(Cmax), /* %4 */ 04107 "m"(Nmin), /* %5 */ 04108 "m"(Nmax) /* %6 */ 04109 ); 04110 #endif 04111 return (0); 04112 #else 04113 return (-1); 04114 #endif 04115 } 04116
/*
 * SDL_imageFilterNormalizeLinear: linear remapping of a byte array,
 *   Dest[i] = clamp(factor * (Src[i] - Cmin) + Nmin, 0, 255)
 * with factor = (Nmax - Nmin) / (Cmax - Cmin) using integer division.
 *
 * Src     Source byte array (must not be NULL).
 * Dest    Destination byte array of at least `length` bytes (must not be NULL).
 * length  Number of bytes to process; 0 is accepted and does nothing.
 * Cmin    Lower bound of the input range.
 * Cmax    Upper bound of the input range.
 * Nmin    Lower bound of the output range.
 * Nmax    Upper bound of the output range.
 *
 * Returns 0 on success, -1 if Src or Dest is NULL.
 *
 * When MMX is reported available and length > 7, the MMX routine is called
 * for the buffer and this function only finishes the trailing (length & 7)
 * bytes in C; otherwise the whole buffer is processed in C.
 */
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
				   int Nmax)
{
	unsigned int i, istart;
	unsigned char *cursrc;
	unsigned char *curdest;
	int dN, dC, factor;
	int result;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

		/* No unaligned bytes - we are done */
		if ((length & 7) == 0)
			return (0);

		/* Setup to process the unaligned tail bytes */
		istart = length & 0xfffffff8;
	} else {
		/* Setup to process whole image */
		istart = 0;
	}
	cursrc = &Src[istart];
	curdest = &Dest[istart];

	/* C routine to process image */
	dC = Cmax - Cmin;
	dN = Nmax - Nmin;
	/*
	 * A zero-width input range used to return early and leave these bytes
	 * unwritten while still reporting success. The MMX routine substitutes
	 * 255 for the factor in that case, so do the same here to keep the bulk
	 * and the tail of the buffer consistent.
	 */
	factor = (dC == 0) ? 255 : (dN / dC);
	for (i = istart; i < length; i++) {
		result = factor * ((int) (*cursrc) - Cmin) + Nmin;
		/* Saturate at both ends; a negative value would otherwise wrap when narrowed to a byte */
		if (result > 255)
			result = 255;
		else if (result < 0)
			result = 0;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc++;
		curdest++;
	}

	return (0);
}
04184 04185 /* ------------------------------------------------------------------------------------ */ 04186 04201 int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04202 signed short *Kernel, unsigned char Divisor) 04203 { 04204 /* Validate input parameters */ 04205 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04206 return(-1); 04207 04208 if ((columns < 3) || (rows < 3) || (Divisor == 0)) 04209 return (-1); 04210 04211 if ((SDL_imageFilterMMXdetect())) { 04212 #ifdef USE_MMX 04213 #if !defined(GCC__) 04214 __asm 04215 { 04216 pusha 04217 pxor mm0, mm0 /* zero MM0 */ 04218 xor ebx, ebx /* zero EBX */ 04219 mov bl, Divisor /* load Divisor into BL */ 04220 mov edx, Kernel /* load Kernel address into EDX */ 04221 movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */ 04222 add edx, 8 /* second row |K0 K1 K2 0| */ 04223 movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */ 04224 add edx, 8 /* third row |K6 K7 K8 0| */ 04225 movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */ 04226 /* ---, */ 04227 mov eax, columns /* load columns into EAX */ 04228 mov esi, Src /* ESI = Src row 0 address */ 04229 mov edi, Dest /* load Dest address to EDI */ 04230 add edi, eax /* EDI = EDI + columns */ 04231 inc edi /* 1 byte offset from the left edge */ 04232 mov edx, rows /* initialize ROWS counter */ 04233 sub edx, 2 /* do not use first and last row */ 04234 /* ---, */ 04235 L10320: 04236 mov ecx, eax /* initialize COLUMS counter */ 04237 sub ecx, 2 /* do not use first and last column */ 04238 align 16 /* 16 byte alignment of the loop entry */ 04239 L10322: 04240 /* ---, */ 04241 movq mm1, [esi] /* load 8 bytes of the image first row */ 04242 add esi, eax /* move one row below */ 04243 movq mm2, [esi] /* load 8 bytes of 
the image second row */ 04244 add esi, eax /* move one row below */ 04245 movq mm3, [esi] /* load 8 bytes of the image third row */ 04246 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04247 punpcklbw mm2, mm0 /* unpack first 4 bytes into words */ 04248 punpcklbw mm3, mm0 /* unpack first 4 bytes into words */ 04249 pmullw mm1, mm5 /* multiply words first row image*Kernel */ 04250 pmullw mm2, mm6 /* multiply words second row image*Kernel */ 04251 pmullw mm3, mm7 /* multiply words third row image*Kernel */ 04252 paddsw mm1, mm2 /* add 4 words of the first and second rows */ 04253 paddsw mm1, mm3 /* add 4 words of the third row and result */ 04254 movq mm2, mm1 /* copy MM1 into MM2 */ 04255 psrlq mm1, 32 /* shift 2 left words to the right */ 04256 paddsw mm1, mm2 /* add 2 left and 2 right result words */ 04257 movq mm3, mm1 /* copy MM1 into MM3 */ 04258 psrlq mm1, 16 /* shift 1 left word to the right */ 04259 paddsw mm1, mm3 /* add 1 left and 1 right result words */ 04260 /* --, */ 04261 movd mm2, eax /* save EAX in MM2 */ 04262 movd mm3, edx /* save EDX in MM3 */ 04263 movd eax, mm1 /* copy MM1 into EAX */ 04264 psraw mm1, 15 /* spread sign bit of the result */ 04265 movd edx, mm1 /* fill EDX with a sign bit */ 04266 idiv bx /* IDIV - VERY EXPENSIVE */ 04267 movd mm1, eax /* move result of division into MM1 */ 04268 packuswb mm1, mm0 /* pack division result with saturation */ 04269 movd eax, mm1 /* copy saturated result into EAX */ 04270 mov [edi], al /* copy a byte result into Dest */ 04271 movd edx, mm3 /* restore saved EDX */ 04272 movd eax, mm2 /* restore saved EAX */ 04273 /* --, */ 04274 sub esi, eax /* move two rows up */ 04275 sub esi, eax /* */ 04276 inc esi /* move Src pointer to the next pixel */ 04277 inc edi /* move Dest pointer to the next pixel */ 04278 /* ---, */ 04279 dec ecx /* decrease loop counter COLUMNS */ 04280 jnz L10322 /* check loop termination, proceed if required */ 04281 add esi, 2 /* move to the next row in Src */ 04282 add 
edi, 2 /* move to the next row in Dest */ 04283 dec edx /* decrease loop counter ROWS */ 04284 jnz L10320 /* check loop termination, proceed if required */ 04285 /* ---, */ 04286 emms /* exit MMX state */ 04287 popa 04288 } 04289 #else 04290 asm volatile 04291 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04292 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04293 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04294 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04295 "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */ 04296 "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */ 04297 "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */ 04298 "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */ 04299 "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */ 04300 /* --- */ 04301 "mov %3, %%eax \n\t" /* load columns into EAX */ 04302 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */ 04303 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04304 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */ 04305 "inc %%edi \n\t" /* 1 byte offset from the left edge */ 04306 "mov %2, %%edx \n\t" /* initialize ROWS counter */ 04307 "sub $2, %%edx \n\t" /* do not use first and last row */ 04308 /* --- */ 04309 ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */ 04310 "sub $2, %%ecx \n\t" /* do not use first and last column */ 04311 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04312 ".L10322: \n\t" 04313 /* --- */ 04314 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */ 04315 "add %%eax, %%esi \n\t" /* move one row below */ 04316 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */ 04317 "add %%eax, %%esi \n\t" /* move one row below */ 04318 "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */ 04319 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04320 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */ 04321 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack 
first 4 bytes into words */ 04322 "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */ 04323 "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */ 04324 "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */ 04325 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */ 04326 "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */ 04327 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04328 "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */ 04329 "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */ 04330 "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */ 04331 "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */ 04332 "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */ 04333 /* -- */ 04334 "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */ 04335 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04336 "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */ 04337 "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */ 04338 "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */ 04339 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04340 "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */ 04341 "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */ 04342 "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */ 04343 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04344 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04345 "movd %%mm2, %%eax \n\t" /* restore saved EAX */ 04346 /* -- */ 04347 "sub %%eax, %%esi \n\t" /* move two rows up */ 04348 "sub %%eax, %%esi \n\t" /* */ 04349 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04350 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04351 /* --- */ 04352 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04353 "jnz .L10322 \n\t" /* check loop termination, proceed if required */ 04354 "add $2, %%esi \n\t" /* move to the next row in 
Src */ 04355 "add $2, %%edi \n\t" /* move to the next row in Dest */ 04356 "dec %%edx \n\t" /* decrease loop counter ROWS */ 04357 "jnz .L10320 \n\t" /* check loop termination, proceed if required */ 04358 /* --- */ 04359 "emms \n\t" /* exit MMX state */ 04360 "popa \n\t":"=m" (Dest) /* %0 */ 04361 :"m"(Src), /* %1 */ 04362 "m"(rows), /* %2 */ 04363 "m"(columns), /* %3 */ 04364 "m"(Kernel), /* %4 */ 04365 "m"(Divisor) /* %5 */ 04366 ); 04367 #endif 04368 #endif 04369 return (0); 04370 } else { 04371 /* No non-MMX implementation yet */ 04372 return (-1); 04373 } 04374 } 04375
/*
 * SDL_imageFilterConvolveKernel5x5Divide: convolve a byte image with a 5x5
 * kernel and divide every sum by Divisor (MMX only).
 *
 * Src      Source byte image; the code steps from one row to the next by
 *          adding `columns`, so rows are `columns` bytes apart.
 * Dest     Destination byte image with the same row spacing.
 * rows     Number of image rows (must be >= 5).
 * columns  Number of image columns (must be >= 5).
 * Kernel   Signed 16-bit coefficients. For each of the five kernel rows the
 *          MMX code loads two groups of four words (16 bytes), i.e. 40 words
 *          in total, and multiplies them with 8 source bytes.
 *          NOTE(review): presumably words 5..7 of every kernel row are zero
 *          padding - confirm against the callers.
 * Divisor  Value the accumulated sum is divided by (16-bit signed divide);
 *          0 is rejected.
 *
 * Returns -1 if a pointer is NULL, rows or columns is below 5, Divisor is 0,
 * or SDL_imageFilterMMXdetect() returns 0 (there is no C fallback);
 * returns 0 otherwise.
 *
 * Only interior pixels are written: output starts 2 rows and 2 columns into
 * Dest and covers (rows - 4) x (columns - 4) bytes; this routine does not
 * write the border bytes of Dest.
 * NOTE(review): each source row is fetched with an 8-byte movq although the
 * window is 5 pixels wide; for the last pixels of the bottom row this looks
 * like it reads up to 3 bytes past rows * columns - confirm that callers pad
 * the Src buffer.
 */
04390 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04391 signed short *Kernel, unsigned char Divisor) 04392 { 04393 /* Validate input parameters */ 04394 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04395 return(-1); 04396 04397 if ((columns < 5) || (rows < 5) || (Divisor == 0)) 04398 return (-1); 04399 04400 if ((SDL_imageFilterMMXdetect())) { 04401 #ifdef USE_MMX 04402 #if !defined(GCC__) 04403 __asm 04404 { 04405 pusha 04406 pxor mm0, mm0 /* zero MM0 */ 04407 xor ebx, ebx /* zero EBX */ 04408 mov bl, Divisor /* load Divisor into BL */ 04409 movd mm5, ebx /* copy Divisor into MM5 */ 04410 mov edx, Kernel /* load Kernel address into EDX */ 04411 mov esi, Src /* load Src address to ESI */ 04412 mov edi, Dest /* load Dest address to EDI */ 04413 add edi, 2 /* 2 column offset from the left edge */ 04414 mov eax, columns /* load columns into EAX */ 04415 shl eax, 1 /* EAX = columns * 2 */ 04416 add edi, eax /* 2 row offset from the top edge */ 04417 shr eax, 1 /* EAX = columns */ 04418 mov ebx, rows /* initialize ROWS counter */ 04419 sub ebx, 4 /* do not use first 2 and last 2 rows */ 04420 /* ---, */ 04421 L10330: 04422 mov ecx, eax /* initialize COLUMNS counter */ 04423 sub ecx, 4 /* do not use first 2 and last 2 columns */ 04424 align 16 /* 16 byte alignment of the loop entry */ 04425 L10332: 04426 pxor mm7, mm7 /* zero 
MM7 (accumulator) */ 04427 movd mm6, esi /* save ESI in MM6 */ 04428 /* --- 1 */ 04429 movq mm1, [esi] /* load 8 bytes of the Src */ 04430 movq mm2, mm1 /* copy MM1 into MM2 */ 04431 add esi, eax /* move Src pointer 1 row below */ 04432 movq mm3, [edx] /* load 4 words of Kernel */ 04433 add edx, 8 /* move pointer to other 4 words */ 04434 movq mm4, [edx] /* load 4 words of Kernel */ 04435 add edx, 8 /* move pointer to other 4 words */ 04436 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04437 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04438 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04439 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04440 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04441 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04442 /* --- 2 */ 04443 movq mm1, [esi] /* load 8 bytes of the Src */ 04444 movq mm2, mm1 /* copy MM1 into MM2 */ 04445 add esi, eax /* move Src pointer 1 row below */ 04446 movq mm3, [edx] /* load 4 words of Kernel */ 04447 add edx, 8 /* move pointer to other 4 words */ 04448 movq mm4, [edx] /* load 4 words of Kernel */ 04449 add edx, 8 /* move pointer to other 4 words */ 04450 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04451 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04452 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04453 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04454 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04455 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04456 /* --- 3 */ 04457 movq mm1, [esi] /* load 8 bytes of the Src */ 04458 movq mm2, mm1 /* copy MM1 into MM2 */ 04459 add esi, eax /* move Src pointer 1 row below */ 04460 movq mm3, [edx] /* load 4 words of Kernel */ 04461 add edx, 8 /* move pointer to other 4 words */ 04462 movq mm4, [edx] /* load 4 words of Kernel */ 04463 add edx, 8 /* move pointer to other 4 words */ 04464 punpcklbw mm1, mm0 /* unpack first 4 bytes into 
words */ 04465 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04466 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04467 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04468 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04469 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04470 /* --- 4 */ 04471 movq mm1, [esi] /* load 8 bytes of the Src */ 04472 movq mm2, mm1 /* copy MM1 into MM2 */ 04473 add esi, eax /* move Src pointer 1 row below */ 04474 movq mm3, [edx] /* load 4 words of Kernel */ 04475 add edx, 8 /* move pointer to other 4 words */ 04476 movq mm4, [edx] /* load 4 words of Kernel */ 04477 add edx, 8 /* move pointer to other 4 words */ 04478 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04479 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04480 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04481 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04482 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04483 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04484 /* --- 5 */ 04485 movq mm1, [esi] /* load 8 bytes of the Src */ 04486 movq mm2, mm1 /* copy MM1 into MM2 */ 04487 movq mm3, [edx] /* load 4 words of Kernel */ 04488 add edx, 8 /* move pointer to other 4 words */ 04489 movq mm4, [edx] /* load 4 words of Kernel */ 04490 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04491 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04492 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04493 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04494 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04495 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04496 /* ---, */ 04497 movq mm3, mm7 /* copy MM7 into MM3 */ 04498 psrlq mm7, 32 /* shift 2 left words to the right */ 04499 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 04500 movq mm2, mm7 /* copy MM7 into MM2 */ 04501 psrlq mm7, 16 /* shift 1 left word to the right */ 
04502 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 04503 /* ---, */ 04504 movd mm1, eax /* save EAX in MM1 */ 04505 movd mm2, ebx /* save EBX in MM2 */ 04506 movd mm3, edx /* save EDX in MM3 */ 04507 movd eax, mm7 /* load summation result into EAX */ 04508 psraw mm7, 15 /* spread sign bit of the result */ 04509 movd ebx, mm5 /* load Divisor into EBX */ 04510 movd edx, mm7 /* fill EDX with a sign bit */ 04511 idiv bx /* IDIV - VERY EXPENSIVE */ 04512 movd mm7, eax /* move result of division into MM7 */ 04513 packuswb mm7, mm0 /* pack division result with saturation */ 04514 movd eax, mm7 /* copy saturated result into EAX */ 04515 mov [edi], al /* copy a byte result into Dest */ 04516 movd edx, mm3 /* restore saved EDX */ 04517 movd ebx, mm2 /* restore saved EBX */ 04518 movd eax, mm1 /* restore saved EAX */ 04519 /* --, */ 04520 movd esi, mm6 /* move Src pointer to the top pixel */ 04521 sub edx, 72 /* EDX = Kernel address (undo the nine 8-byte advances above) */ 04522 inc esi /* move Src pointer to the next pixel */ 04523 inc edi /* move Dest pointer to the next pixel */ 04524 /* ---, */ 04525 dec ecx /* decrease loop counter COLUMNS */ 04526 jnz L10332 /* check loop termination, proceed if required */ 04527 add esi, 4 /* move to the next row in Src */ 04528 add edi, 4 /* move to the next row in Dest */ 04529 dec ebx /* decrease loop counter ROWS */ 04530 jnz L10330 /* check loop termination, proceed if required */ 04531 /* ---, */ 04532 emms /* exit MMX state */ 04533 popa 04534 } 04535 #else 04536 asm volatile 04537 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04538 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04539 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04540 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 04541 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04542 "mov %1, %%esi \n\t" /* load Src address to ESI */ 04543 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04544 "add $2, %%edi \n\t" /* 2 column offset from the left edge */ 04545 "mov %3, %%eax 
\n\t" /* load columns into EAX */ 04546 "shl $1, %%eax \n\t" /* EAX = columns * 2 */ 04547 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */ 04548 "shr $1, %%eax \n\t" /* EAX = columns */ 04549 "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 04550 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */ 04551 /* --- */ 04552 ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 04553 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */ 04554 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04555 ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 04556 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 04557 /* --- 1 */ 04558 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04559 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04560 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04561 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04562 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04563 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04564 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04565 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04566 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04567 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04568 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04569 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04570 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04571 /* --- 2 */ 04572 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04573 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04574 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04575 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04576 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04577 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04578 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04579 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04580 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04581 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04582 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04583 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04584 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04585 /* --- 3 */ 04586 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04587 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04588 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04589 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04590 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04591 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04592 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04593 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04594 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04595 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04596 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04597 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04598 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04599 /* --- 4 */ 04600 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04601 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04602 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04603 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04604 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04605 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04606 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04607 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04608 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04609 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04610 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04611 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04612 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04613 /* --- 5 */ 04614 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04615 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04616 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04617 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04618 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04619 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04620 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04621 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04622 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04623 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04624 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04625 /* --- */ 04626 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 04627 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 04628 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 04629 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 04630 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 04631 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 04632 /* --- */ 04633 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 04634 "movd %%ebx, %%mm2 \n\t" /* save EBX in MM2 */ 04635 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04636 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 04637 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 04638 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 04639 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 04640 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04641 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 04642 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 04643 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 04644 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04645 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04646 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 04647 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 04648 /* -- */ 04649 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 04650 "sub $72, %%edx \n\t" /* EDX = Kernel address (undo the nine 8-byte advances above) */ 04651 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04652 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04653 /* --- */ 04654 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04655 "jnz .L10332 \n\t" /* check loop termination, proceed if required */ 04656 "add $4, %%esi \n\t" /* move to the next row in Src */ 
04657 "add $4, %%edi \n\t" /* move to the next row in Dest */ 04658 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 04659 "jnz .L10330 \n\t" /* check loop termination, proceed if required */ 04660 /* --- */ 04661 "emms \n\t" /* exit MMX state */ 04662 "popa \n\t":"=m" (Dest) /* %0 */ 04663 :"m"(Src), /* %1 */ 04664 "m"(rows), /* %2 */ 04665 "m"(columns), /* %3 */ 04666 "m"(Kernel), /* %4 */ 04667 "m"(Divisor) /* %5 */ 04668 ); 04669 #endif 04670 #endif 04671 return (0); 04672 } else { 04673 /* No non-MMX implementation yet */ 04674 return (-1); 04675 } 04676 } 04677 04692 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04693 signed short *Kernel, unsigned char Divisor) 04694 { 04695 /* Validate input parameters */ 04696 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04697 return(-1); 04698 04699 if ((columns < 7) || (rows < 7) || (Divisor == 0)) 04700 return (-1); 04701 04702 if ((SDL_imageFilterMMXdetect())) { 04703 #ifdef USE_MMX 04704 #if !defined(GCC__) 04705 __asm 04706 { 04707 pusha 04708 pxor mm0, mm0 /* zero MM0 */ 04709 xor ebx, ebx /* zero EBX */ 04710 mov bl, Divisor /* load Divisor into BL */ 04711 movd mm5, ebx /* copy Divisor into MM5 */ 04712 mov edx, Kernel /* load Kernel address into EDX */ 04713 mov esi, Src /* load Src address to ESI */ 04714 mov edi, Dest /* load Dest address to EDI */ 04715 add edi, 3 /* 3 column offset from the left edge */ 04716 mov eax, columns /* load columns into EAX */ 04717 add edi, eax /* 3 row offset from the top edge */ 04718 add edi, eax 04719 add edi, eax 04720 mov ebx, rows /* initialize ROWS counter */ 04721 sub ebx, 6 /* do not use first 3 and last 3 rows */ 04722 /* ---, */ 04723 L10340: 04724 mov ecx, eax /* initialize COLUMNS counter */ 04725 sub ecx, 6 /* do not use first 3 and last 3 columns */ 04726 align 16 /* 16 byte alignment of the loop entry */ 04727 L10342: 04728 pxor mm7, mm7 /* zero MM7 (accumulator) */ 04729 movd mm6, esi /* 
save ESI in MM6 */ 04730 /* --- 1 */ 04731 movq mm1, [esi] /* load 8 bytes of the Src */ 04732 movq mm2, mm1 /* copy MM1 into MM2 */ 04733 add esi, eax /* move Src pointer 1 row below */ 04734 movq mm3, [edx] /* load 4 words of Kernel */ 04735 add edx, 8 /* move pointer to other 4 words */ 04736 movq mm4, [edx] /* load 4 words of Kernel */ 04737 add edx, 8 /* move pointer to other 4 words */ 04738 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04739 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04740 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04741 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04742 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04743 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04744 /* --- 2 */ 04745 movq mm1, [esi] /* load 8 bytes of the Src */ 04746 movq mm2, mm1 /* copy MM1 into MM2 */ 04747 add esi, eax /* move Src pointer 1 row below */ 04748 movq mm3, [edx] /* load 4 words of Kernel */ 04749 add edx, 8 /* move pointer to other 4 words */ 04750 movq mm4, [edx] /* load 4 words of Kernel */ 04751 add edx, 8 /* move pointer to other 4 words */ 04752 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04753 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04754 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04755 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04756 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04757 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04758 /* --- 3 */ 04759 movq mm1, [esi] /* load 8 bytes of the Src */ 04760 movq mm2, mm1 /* copy MM1 into MM2 */ 04761 add esi, eax /* move Src pointer 1 row below */ 04762 movq mm3, [edx] /* load 4 words of Kernel */ 04763 add edx, 8 /* move pointer to other 4 words */ 04764 movq mm4, [edx] /* load 4 words of Kernel */ 04765 add edx, 8 /* move pointer to other 4 words */ 04766 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04767 punpckhbw mm2, mm0 /* unpack 
second 4 bytes into words */ 04768 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04769 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04770 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04771 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04772 /* --- 4 */ 04773 movq mm1, [esi] /* load 8 bytes of the Src */ 04774 movq mm2, mm1 /* copy MM1 into MM2 */ 04775 add esi, eax /* move Src pointer 1 row below */ 04776 movq mm3, [edx] /* load 4 words of Kernel */ 04777 add edx, 8 /* move pointer to other 4 words */ 04778 movq mm4, [edx] /* load 4 words of Kernel */ 04779 add edx, 8 /* move pointer to other 4 words */ 04780 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04781 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04782 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04783 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04784 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04785 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04786 /* --- 5 */ 04787 movq mm1, [esi] /* load 8 bytes of the Src */ 04788 movq mm2, mm1 /* copy MM1 into MM2 */ 04789 add esi, eax /* move Src pointer 1 row below */ 04790 movq mm3, [edx] /* load 4 words of Kernel */ 04791 add edx, 8 /* move pointer to other 4 words */ 04792 movq mm4, [edx] /* load 4 words of Kernel */ 04793 add edx, 8 /* move pointer to other 4 words */ 04794 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04795 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04796 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04797 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04798 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04799 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04800 /* --- 6 */ 04801 movq mm1, [esi] /* load 8 bytes of the Src */ 04802 movq mm2, mm1 /* copy MM1 into MM2 */ 04803 add esi, eax /* move Src pointer 1 row below */ 04804 movq mm3, [edx] /* load 4 words of Kernel */ 
04805 add edx, 8 /* move pointer to other 4 words */ 04806 movq mm4, [edx] /* load 4 words of Kernel */ 04807 add edx, 8 /* move pointer to other 4 words */ 04808 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04809 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04810 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04811 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04812 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04813 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04814 /* --- 7 */ 04815 movq mm1, [esi] /* load 8 bytes of the Src */ 04816 movq mm2, mm1 /* copy MM1 into MM2 */ 04817 movq mm3, [edx] /* load 4 words of Kernel */ 04818 add edx, 8 /* move pointer to other 4 words */ 04819 movq mm4, [edx] /* load 4 words of Kernel */ 04820 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04821 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04822 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04823 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04824 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04825 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04826 /* ---, */ 04827 movq mm3, mm7 /* copy MM7 into MM3 */ 04828 psrlq mm7, 32 /* shift 2 left words to the right */ 04829 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 04830 movq mm2, mm7 /* copy MM7 into MM2 */ 04831 psrlq mm7, 16 /* shift 1 left word to the right */ 04832 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 04833 /* ---, */ 04834 movd mm1, eax /* save EDX in MM1 */ 04835 movd mm2, ebx /* save EDX in MM2 */ 04836 movd mm3, edx /* save EDX in MM3 */ 04837 movd eax, mm7 /* load summation result into EAX */ 04838 psraw mm7, 15 /* spread sign bit of the result */ 04839 movd ebx, mm5 /* load Divisor into EBX */ 04840 movd edx, mm7 /* fill EDX with a sign bit */ 04841 idiv bx /* IDIV - VERY EXPENSIVE */ 04842 movd mm7, eax /* move result of division into MM7 */ 04843 packuswb mm7, 
mm0 /* pack division result with saturation */ 04844 movd eax, mm7 /* copy saturated result into EAX */ 04845 mov [edi], al /* copy a byte result into Dest */ 04846 movd edx, mm3 /* restore saved EDX */ 04847 movd ebx, mm2 /* restore saved EBX */ 04848 movd eax, mm1 /* restore saved EAX */ 04849 /* --, */ 04850 movd esi, mm6 /* move Src pointer to the top pixel */ 04851 sub edx, 104 /* EDX = Kernel address */ 04852 inc esi /* move Src pointer to the next pixel */ 04853 inc edi /* move Dest pointer to the next pixel */ 04854 /* ---, */ 04855 dec ecx /* decrease loop counter COLUMNS */ 04856 jnz L10342 /* check loop termination, proceed if required */ 04857 add esi, 6 /* move to the next row in Src */ 04858 add edi, 6 /* move to the next row in Dest */ 04859 dec ebx /* decrease loop counter ROWS */ 04860 jnz L10340 /* check loop termination, proceed if required */ 04861 /* ---, */ 04862 emms /* exit MMX state */ 04863 popa 04864 } 04865 #else 04866 asm volatile 04867 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04868 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04869 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04870 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 04871 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04872 "mov %1, %%esi \n\t" /* load Src address to ESI */ 04873 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04874 "add $3, %%edi \n\t" /* 3 column offset from the left edge */ 04875 "mov %3, %%eax \n\t" /* load columns into EAX */ 04876 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */ 04877 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 04878 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */ 04879 /* --- */ 04880 ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 04881 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */ 04882 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04883 ".L10342: \n\t" "pxor %%mm7, 
%%mm7 \n\t" /* zero MM7 (accumulator) */ 04884 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 04885 /* --- 1 */ 04886 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04887 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04888 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04889 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04890 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04891 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04892 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04893 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04894 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04895 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04896 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04897 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04898 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04899 /* --- 2 */ 04900 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04901 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04902 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04903 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04904 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04905 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04906 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04907 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04908 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04909 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04910 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04911 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04912 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04913 /* --- 3 */ 04914 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04915 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04916 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04917 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04918 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04919 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04920 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04921 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04922 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04923 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04924 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04925 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04926 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04927 /* --- 4 */ 04928 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04929 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04930 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04931 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04932 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04933 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04934 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04935 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04936 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04937 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04938 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04939 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04940 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04941 /* --- 5 */ 04942 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04943 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04944 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04945 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04946 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04947 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04948 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04949 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04950 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04951 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04952 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04953 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04954 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04955 /* --- 6 */ 04956 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04957 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04958 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04959 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04960 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04961 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04962 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04963 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04964 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04965 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04966 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04967 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04968 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04969 /* --- 7 */ 04970 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04971 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04972 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04973 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04974 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04975 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04976 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04977 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04978 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04979 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04980 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04981 /* --- */ 04982 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 04983 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 04984 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 04985 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 04986 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 04987 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 04988 /* --- */ 04989 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 04990 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 04991 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04992 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 04993 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 04994 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 04995 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 04996 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04997 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 04998 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 04999 "movd 
%%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05000 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05001 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 05002 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 05003 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05004 /* -- */ 05005 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05006 "sub $104, %%edx \n\t" /* EDX = Kernel address */ 05007 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05008 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05009 /* --- */ 05010 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05011 "jnz .L10342 \n\t" /* check loop termination, proceed if required */ 05012 "add $6, %%esi \n\t" /* move to the next row in Src */ 05013 "add $6, %%edi \n\t" /* move to the next row in Dest */ 05014 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 05015 "jnz .L10340 \n\t" /* check loop termination, proceed if required */ 05016 /* --- */ 05017 "emms \n\t" /* exit MMX state */ 05018 "popa \n\t":"=m" (Dest) /* %0 */ 05019 :"m"(Src), /* %1 */ 05020 "m"(rows), /* %2 */ 05021 "m"(columns), /* %3 */ 05022 "m"(Kernel), /* %4 */ 05023 "m"(Divisor) /* %5 */ 05024 ); 05025 #endif 05026 #endif 05027 return (0); 05028 } else { 05029 /* No non-MMX implementation yet */ 05030 return (-1); 05031 } 05032 } 05033 05048 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 05049 signed short *Kernel, unsigned char Divisor) 05050 { 05051 /* Validate input parameters */ 05052 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 05053 return(-1); 05054 05055 if ((columns < 9) || (rows < 9) || (Divisor == 0)) 05056 return (-1); 05057 05058 if ((SDL_imageFilterMMXdetect())) { 05059 #ifdef USE_MMX 05060 #if !defined(GCC__) 05061 __asm 05062 { 05063 pusha 05064 pxor mm0, mm0 /* zero MM0 */ 05065 xor ebx, ebx /* zero EBX */ 05066 mov bl, Divisor /* load Divisor into BL */ 05067 movd mm5, ebx /* copy Divisor 
into MM5 */ 05068 mov edx, Kernel /* load Kernel address into EDX */ 05069 mov esi, Src /* load Src address to ESI */ 05070 mov edi, Dest /* load Dest address to EDI */ 05071 add edi, 4 /* 4 column offset from the left edge */ 05072 mov eax, columns /* load columns into EAX */ 05073 add edi, eax /* 4 row offset from the top edge */ 05074 add edi, eax 05075 add edi, eax 05076 add edi, eax 05077 mov ebx, rows /* initialize ROWS counter */ 05078 sub ebx, 8 /* do not use first 4 and last 4 rows */ 05079 /* ---, */ 05080 L10350: 05081 mov ecx, eax /* initialize COLUMNS counter */ 05082 sub ecx, 8 /* do not use first 4 and last 4 columns */ 05083 align 16 /* 16 byte alignment of the loop entry */ 05084 L10352: 05085 pxor mm7, mm7 /* zero MM7 (accumulator) */ 05086 movd mm6, esi /* save ESI in MM6 */ 05087 /* --- 1 */ 05088 movq mm1, [esi] /* load 8 bytes of the Src */ 05089 movq mm2, mm1 /* copy MM1 into MM2 */ 05090 inc esi /* move pointer to the next 8 bytes of Src */ 05091 movq mm3, [edx] /* load 4 words of Kernel */ 05092 add edx, 8 /* move pointer to other 4 words */ 05093 movq mm4, [edx] /* load 4 words of Kernel */ 05094 add edx, 8 /* move pointer to other 4 words */ 05095 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05096 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05097 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05098 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05099 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05100 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05101 movq mm1, [esi] /* load 8 bytes of the Src */ 05102 dec esi 05103 add esi, eax /* move Src pointer 1 row below */ 05104 movq mm3, [edx] /* load 4 words of Kernel */ 05105 add edx, 8 /* move pointer to other 4 words */ 05106 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05107 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 05108 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05109 /* --- 2 */ 05110 movq mm1, [esi] /* load 8 bytes of the Src */ 05111 movq mm2, mm1 /* copy MM1 into MM2 */ 05112 inc esi /* move pointer to the next 8 bytes of Src */ 05113 movq mm3, [edx] /* load 4 words of Kernel */ 05114 add edx, 8 /* move pointer to other 4 words */ 05115 movq mm4, [edx] /* load 4 words of Kernel */ 05116 add edx, 8 /* move pointer to other 4 words */ 05117 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05118 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05119 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05120 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05121 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05122 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05123 movq mm1, [esi] /* load 8 bytes of the Src */ 05124 dec esi 05125 add esi, eax /* move Src pointer 1 row below */ 05126 movq mm3, [edx] /* load 4 words of Kernel */ 05127 add edx, 8 /* move pointer to other 4 words */ 05128 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05129 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05130 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05131 /* --- 3 */ 05132 movq mm1, [esi] /* load 8 bytes of the Src */ 05133 movq mm2, mm1 /* copy MM1 into MM2 */ 05134 inc esi /* move pointer to the next 8 bytes of Src */ 05135 movq mm3, [edx] /* load 4 words of Kernel */ 05136 add edx, 8 /* move pointer to other 4 words */ 05137 movq mm4, [edx] /* load 4 words of Kernel */ 05138 add edx, 8 /* move pointer to other 4 words */ 05139 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05140 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05141 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05142 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 05143 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05144 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05145 movq mm1, [esi] /* load 8 bytes of the Src */ 05146 dec esi 05147 add esi, eax /* move Src pointer 1 row below */ 05148 movq mm3, [edx] /* load 4 words of Kernel */ 05149 add edx, 8 /* move pointer to other 4 words */ 05150 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05151 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05152 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05153 /* --- 4 */ 05154 movq mm1, [esi] /* load 8 bytes of the Src */ 05155 movq mm2, mm1 /* copy MM1 into MM2 */ 05156 inc esi /* move pointer to the next 8 bytes of Src */ 05157 movq mm3, [edx] /* load 4 words of Kernel */ 05158 add edx, 8 /* move pointer to other 4 words */ 05159 movq mm4, [edx] /* load 4 words of Kernel */ 05160 add edx, 8 /* move pointer to other 4 words */ 05161 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05162 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05163 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05164 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05165 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05166 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05167 movq mm1, [esi] /* load 8 bytes of the Src */ 05168 dec esi 05169 add esi, eax /* move Src pointer 1 row below */ 05170 movq mm3, [edx] /* load 4 words of Kernel */ 05171 add edx, 8 /* move pointer to other 4 words */ 05172 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05173 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 05174 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05175 /* --- 5 */ 05176 movq mm1, [esi] /* load 8 bytes of the Src */ 05177 movq mm2, mm1 /* copy MM1 into MM2 */ 05178 inc esi /* move pointer to the next 8 bytes of Src */ 05179 movq mm3, [edx] /* load 4 words of Kernel */ 05180 add edx, 8 /* move pointer to other 4 words */ 05181 movq mm4, [edx] /* load 4 words of Kernel */ 05182 add edx, 8 /* move pointer to other 4 words */ 05183 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05184 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05185 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05186 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05187 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05188 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05189 movq mm1, [esi] /* load 8 bytes of the Src */ 05190 dec esi 05191 add esi, eax /* move Src pointer 1 row below */ 05192 movq mm3, [edx] /* load 4 words of Kernel */ 05193 add edx, 8 /* move pointer to other 4 words */ 05194 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05195 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05196 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05197 /* --- 6 */ 05198 movq mm1, [esi] /* load 8 bytes of the Src */ 05199 movq mm2, mm1 /* copy MM1 into MM2 */ 05200 inc esi /* move pointer to the next 8 bytes of Src */ 05201 movq mm3, [edx] /* load 4 words of Kernel */ 05202 add edx, 8 /* move pointer to other 4 words */ 05203 movq mm4, [edx] /* load 4 words of Kernel */ 05204 add edx, 8 /* move pointer to other 4 words */ 05205 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05206 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05207 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05208 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 05209 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05210 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05211 movq mm1, [esi] /* load 8 bytes of the Src */ 05212 dec esi 05213 add esi, eax /* move Src pointer 1 row below */ 05214 movq mm3, [edx] /* load 4 words of Kernel */ 05215 add edx, 8 /* move pointer to other 4 words */ 05216 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05217 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05218 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05219 /* --- 7 */ 05220 movq mm1, [esi] /* load 8 bytes of the Src */ 05221 movq mm2, mm1 /* copy MM1 into MM2 */ 05222 inc esi /* move pointer to the next 8 bytes of Src */ 05223 movq mm3, [edx] /* load 4 words of Kernel */ 05224 add edx, 8 /* move pointer to other 4 words */ 05225 movq mm4, [edx] /* load 4 words of Kernel */ 05226 add edx, 8 /* move pointer to other 4 words */ 05227 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05228 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05229 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05230 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05231 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05232 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05233 movq mm1, [esi] /* load 8 bytes of the Src */ 05234 dec esi 05235 add esi, eax /* move Src pointer 1 row below */ 05236 movq mm3, [edx] /* load 4 words of Kernel */ 05237 add edx, 8 /* move pointer to other 4 words */ 05238 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05239 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 05240 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05241 /* --- 8 */ 05242 movq mm1, [esi] /* load 8 bytes of the Src */ 05243 movq mm2, mm1 /* copy MM1 into MM2 */ 05244 inc esi /* move pointer to the next 8 bytes of Src */ 05245 movq mm3, [edx] /* load 4 words of Kernel */ 05246 add edx, 8 /* move pointer to other 4 words */ 05247 movq mm4, [edx] /* load 4 words of Kernel */ 05248 add edx, 8 /* move pointer to other 4 words */ 05249 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05250 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05251 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05252 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05253 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05254 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05255 movq mm1, [esi] /* load 8 bytes of the Src */ 05256 dec esi 05257 add esi, eax /* move Src pointer 1 row below */ 05258 movq mm3, [edx] /* load 4 words of Kernel */ 05259 add edx, 8 /* move pointer to other 4 words */ 05260 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05261 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05262 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05263 /* --- 9 */ 05264 movq mm1, [esi] /* load 8 bytes of the Src */ 05265 movq mm2, mm1 /* copy MM1 into MM2 */ 05266 inc esi /* move pointer to the next 8 bytes of Src */ 05267 movq mm3, [edx] /* load 4 words of Kernel */ 05268 add edx, 8 /* move pointer to other 4 words */ 05269 movq mm4, [edx] /* load 4 words of Kernel */ 05270 add edx, 8 /* move pointer to other 4 words */ 05271 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05272 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05273 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05274 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 05275 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05276 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05277 movq mm1, [esi] /* load 8 bytes of the Src */ 05278 movq mm3, [edx] /* load 4 words of Kernel */ 05279 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05280 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05281 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05282 /* ---, */ 05283 movq mm3, mm7 /* copy MM7 into MM3 */ 05284 psrlq mm7, 32 /* shift 2 left words to the right */ 05285 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 05286 movq mm2, mm7 /* copy MM7 into MM2 */ 05287 psrlq mm7, 16 /* shift 1 left word to the right */ 05288 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 05289 /* ---, */ 05290 movd mm1, eax /* save EDX in MM1 */ 05291 movd mm2, ebx /* save EDX in MM2 */ 05292 movd mm3, edx /* save EDX in MM3 */ 05293 movd eax, mm7 /* load summation result into EAX */ 05294 psraw mm7, 15 /* spread sign bit of the result */ 05295 movd ebx, mm5 /* load Divisor into EBX */ 05296 movd edx, mm7 /* fill EDX with a sign bit */ 05297 idiv bx /* IDIV - VERY EXPENSIVE */ 05298 movd mm7, eax /* move result of division into MM7 */ 05299 packuswb mm7, mm0 /* pack division result with saturation */ 05300 movd eax, mm7 /* copy saturated result into EAX */ 05301 mov [edi], al /* copy a byte result into Dest */ 05302 movd edx, mm3 /* restore saved EDX */ 05303 movd ebx, mm2 /* restore saved EBX */ 05304 movd eax, mm1 /* restore saved EAX */ 05305 /* --, */ 05306 movd esi, mm6 /* move Src pointer to the top pixel */ 05307 sub edx, 208 /* EDX = Kernel address */ 05308 inc esi /* move Src pointer to the next pixel */ 05309 inc edi /* move Dest pointer to the next pixel */ 05310 /* ---, */ 05311 dec ecx /* decrease loop counter COLUMNS */ 05312 jnz L10352 /* check loop termination, proceed if required */ 05313 add esi, 8 /* move to the next row in Src */ 05314 add edi, 8 /* move 
to the next row in Dest */ 05315 dec ebx /* decrease loop counter ROWS */ 05316 jnz L10350 /* check loop termination, proceed if required */ 05317 /* ---, */ 05318 emms /* exit MMX state */ 05319 popa 05320 } 05321 #else 05322 asm volatile 05323 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05324 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05325 "mov %5, %%bl \n\t" /* load Divisor into BL */ 05326 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 05327 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05328 "mov %1, %%esi \n\t" /* load Src address to ESI */ 05329 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05330 "add $4, %%edi \n\t" /* 4 column offset from the left edge */ 05331 "mov %3, %%eax \n\t" /* load columns into EAX */ 05332 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */ 05333 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 05334 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */ 05335 /* --- */ 05336 ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 05337 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */ 05338 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05339 ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 05340 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 05341 /* --- 1 */ 05342 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05343 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05344 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05345 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05346 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05347 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05348 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05349 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05350 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05351 "pmullw 
%%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05352 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05353 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05354 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05355 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05356 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05357 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05358 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05359 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05360 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05361 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05362 /* --- 2 */ 05363 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05364 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05365 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05366 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05367 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05368 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05369 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05370 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05371 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05372 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05373 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05374 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05375 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05376 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05377 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05378 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05379 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05380 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05381 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05382 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05383 /* --- 3 */ 05384 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05385 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05386 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05387 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05388 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05389 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05390 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05391 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05392 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05393 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05394 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05395 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05396 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05397 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05398 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05399 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05400 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05401 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05402 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05403 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05404 /* --- 4 */ 05405 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05406 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05407 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05408 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05409 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05410 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05411 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05412 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05413 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05414 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05415 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05416 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05417 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05418 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05419 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05420 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05421 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05422 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05423 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05424 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05425 /* --- 5 */ 05426 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05427 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05428 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05429 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05430 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05431 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05432 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05433 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05434 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05435 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05436 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05437 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05438 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05439 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05440 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05441 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05442 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05443 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05444 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05445 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05446 /* --- 6 */ 05447 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05448 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05449 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05450 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05451 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05452 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05453 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05454 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05455 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05456 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05457 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05458 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05459 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05460 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05461 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05462 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05463 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05464 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05465 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05466 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05467 /* --- 7 */ 05468 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05469 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05470 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05471 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05472 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05473 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05474 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05475 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05476 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05477 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05478 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05479 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05480 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05481 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05482 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05483 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05484 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05485 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05486 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05487 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05488 /* --- 8 */ 05489 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05490 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05491 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05492 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05493 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05494 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05495 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05496 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05497 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05498 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05499 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05500 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05501 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05502 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05503 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05504 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05505 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05506 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05507 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05508 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05509 /* --- 9 */ 05510 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05511 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05512 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05513 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05514 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05515 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05516 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05517 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05518 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05519 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05520 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05521 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05522 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05523 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05524 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05525 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05526 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05527 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05528 /* --- */ 05529 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 05530 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 05531 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 05532 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 05533 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 05534 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 05535 /* --- */ 05536 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 05537 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 05538 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 05539 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 05540 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 05541 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 05542 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 05543 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 05544 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 05545 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 05546 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05547 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05548 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 05549 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 05550 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05551 /* -- */ 05552 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05553 "sub $208, %%edx \n\t" /* EDX = Kernel address */ 05554 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05555 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05556 /* --- */ 05557 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05558 "jnz .L10352 \n\t" /* check loop termination, proceed if required */ 05559 "add $8, %%esi \n\t" /* move to the next row in Src */ 05560 "add $8, %%edi \n\t" /* move to the next row in Dest */ 05561 "dec %%ebx 
/* Signed low 16 bits of a * b: the per-word result the MMX PMULLW instruction keeps. */
static int convolve3x3SR_mullw(int a, int b)
{
	unsigned int low = (unsigned int) (a * b) & 0xFFFFu;

	return (low & 0x8000u) ? (int) low - 0x10000 : (int) low;
}

/* Signed 16-bit saturating addition: one word of the MMX PADDSW instruction. */
static int convolve3x3SR_addsw(int a, int b)
{
	int sum = a + b;

	if (sum > 32767)
		return (32767);
	if (sum < -32768)
		return (-32768);
	return (sum);
}

/*!
\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( sum( (Sij >> NRightShift) * Kij ) )

Every source pixel is shifted right by NRightShift bits BEFORE it is multiplied with
its kernel word. Only interior pixels of Dest are written; the first and last row and
the first and last column of Dest are left untouched.

The MMX path is used when USE_MMX is compiled in and SDL_imageFilterMMXdetect() reports
MMX; otherwise a portable C path is used that reproduces the same 16-bit word arithmetic
(wrapping multiply, saturating adds, final clamp to 0..255).

\param Src The source 2D byte array (rows * columns bytes). The MMX path loads 8 bytes at a time
           and therefore reads up to 5 bytes past the end of a buffer of exactly rows * columns bytes.
\param Dest The destination 2D byte array (rows * columns bytes).
\param rows Number of rows in source/destination array. Must be >= 3.
\param columns Number of columns in source/destination array. Must be >= 3.
\param Kernel The kernel stored as three rows of FOUR signed words (12 words in total):
              |K0 K1 K2 0|, |K3 K4 K5 0|, |K6 K7 K8 0|. The fourth word of every row is padding
              and must be 0; the MMX path multiplies it with the pixel right of the window,
              the C path ignores it.
\param NRightShift Number of bits each pixel is shifted right. Must be <= 7.

\return Returns 0 for success or -1 for invalid parameters.
*/
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
					       signed short *Kernel, unsigned char NRightShift)
{
	size_t stride;
	int x, y;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 3) || (rows < 3) || (NRightShift > 7))
		return (-1);

#ifdef USE_MMX
	if ((SDL_imageFilterMMXdetect())) {
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm4, ebx   	/* copy NRightShift into MM4 */
			mov edx, Kernel   	/* load Kernel address into EDX */
			movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
			add edx, 8   	/* second row              |K0 K1 K2 0| */
			movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			add edx, 8   	/* third row               |K6 K7 K8 0| */
			movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
			/* --- */
			mov eax, columns   	/* load columns into EAX */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last row */
			/* --- */
L10360:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 2   	/* do not use first and last column */
			align 16   	/* 16 byte alignment of the loop entry */
L10362:
			/* --- */
			movq mm1, [esi]   	/* load 8 bytes of the image first row */
			add esi, eax   	/* move one row below */
			movq mm2, [esi]   	/* load 8 bytes of the image second row */
			add esi, eax   	/* move one row below */
			movq mm3, [esi]   	/* load 8 bytes of the image third row */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm4   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm4   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm4   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
			pmullw mm2, mm6   	/* multiply words second row image*Kernel */
			pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
			paddsw mm1, mm2   	/* add 4 words of the first and second rows */
			paddsw mm1, mm3   	/* add 4 words of the third row and result */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			psrlq mm1, 32   	/* shift 2 left words to the right */
			paddsw mm1, mm2   	/* add 2 left and 2 right result words */
			movq mm3, mm1   	/* copy MM1 into MM3 */
			psrlq mm1, 16   	/* shift 1 left word to the right */
			paddsw mm1, mm3   	/* add 1 left and 1 right result words */
			packuswb mm1, mm0   	/* pack result with unsigned byte saturation */
			movd ebx, mm1   	/* copy saturated result into EBX */
			mov [edi], bl   	/* copy a byte result into Dest */
			/* --- */
			sub esi, eax   	/* move two rows up */
			sub esi, eax
			inc esi   	/* move Src pointer to the next pixel */
			inc edi   	/* move Dest pointer to the next pixel */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10362   	/* check loop termination, proceed if required */
			add esi, 2   	/* move to the next row in Src */
			add edi, 2   	/* move to the next row in Dest */
			dec edx   	/* decrease loop counter ROWS */
			jnz L10360   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/*
		 * NOTE(review): all operands are memory references that are used after
		 * `pusha` has moved the stack pointer. If the compiler addresses them
		 * relative to %esp (e.g. frame pointer omitted) they would resolve to
		 * the wrong stack slots - confirm the build flags used for this file.
		 */
		asm volatile
		 ("pusha		     \n\t"
		  "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
		  "xor       %%ebx, %%ebx \n\t"	/* zero EBX */
		  "mov          %5, %%bl  \n\t"	/* load NRightShift into BL */
		  "movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
		  "mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
		  "movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
		  "add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
		  "movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
		  "add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
		  "movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
		  /* --- */
		  "mov          %3, %%eax \n\t"	/* load columns into EAX */
		  "mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
		  "mov          %0, %%edi \n\t"	/* load Dest address to EDI */
		  "add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
		  "inc              %%edi \n\t"	/* 1 byte offset from the left edge */
		  "mov          %2, %%edx \n\t"	/* initialize ROWS counter */
		  "sub          $2, %%edx \n\t"	/* do not use first and last row */
		  /* --- */
		  ".L10360:               \n\t"
		  "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
		  "sub          $2, %%ecx \n\t"	/* do not use first and last column */
		  ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		  ".L10362:               \n\t"
		  /* --- */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
		  "add       %%eax, %%esi \n\t"	/* move one row below */
		  "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
		  "add       %%eax, %%esi \n\t"	/* move one row below */
		  "movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
		  "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
		  "punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
		  "psrlw     %%mm4, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm4, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm4, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
		  "pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
		  "pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
		  "paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
		  "movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
		  "psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
		  "paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
		  "packuswb  %%mm0, %%mm1 \n\t"	/* pack result with unsigned byte saturation */
		  "movd      %%mm1, %%ebx \n\t"	/* copy saturated result into EBX */
		  "mov      %%bl, (%%edi) \n\t"	/* copy a byte result into Dest */
		  /* --- */
		  "sub       %%eax, %%esi \n\t"	/* move two rows up */
		  "sub       %%eax, %%esi \n\t"
		  "inc              %%esi \n\t"	/* move Src pointer to the next pixel */
		  "inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
		  /* --- */
		  "dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
		  "jnz            .L10362 \n\t"	/* check loop termination, proceed if required */
		  "add          $2, %%esi \n\t"	/* move to the next row in Src */
		  "add          $2, %%edi \n\t"	/* move to the next row in Dest */
		  "dec              %%edx \n\t"	/* decrease loop counter ROWS */
		  "jnz            .L10360 \n\t"	/* check loop termination, proceed if required */
		  /* --- */
		  "emms                   \n\t"	/* exit MMX state */
		  "popa                   \n\t"
		  :			/* no outputs: the Dest pointer itself is only read */
		  :"m"(Dest),		/* %0 */
		  "m"(Src),		/* %1 */
		  "m"(rows),		/* %2 */
		  "m"(columns),		/* %3 */
		  "m"(Kernel),		/* %4 */
		  "m"(NRightShift)	/* %5 */
		  :"memory", "cc"	/* the pixels behind Dest are written, flags are changed */
		    );
#endif
		return (0);
	}
#endif

	/*
	 * Portable path. Mirrors the word arithmetic of the MMX code: one
	 * accumulator per kernel column (summed over the three rows), then the
	 * columns are combined pairwise, each step with 16-bit saturation.
	 */
	stride = (size_t) columns;
	for (y = 1; y < rows - 1; y++) {
		const unsigned char *win = Src + (size_t) (y - 1) * stride;	/* top-left pixel of the first window */
		unsigned char *out = Dest + (size_t) y * stride + 1;	/* first interior pixel of this row */

		for (x = 0; x < columns - 2; x++) {
			int lane[3];
			int j, sum;

			for (j = 0; j < 3; j++) {
				const unsigned char *p = win + x + j;
				int acc = convolve3x3SR_mullw(p[0] >> NRightShift, Kernel[j]);

				acc = convolve3x3SR_addsw(acc, convolve3x3SR_mullw(p[stride] >> NRightShift, Kernel[4 + j]));
				acc = convolve3x3SR_addsw(acc, convolve3x3SR_mullw(p[2 * stride] >> NRightShift, Kernel[8 + j]));
				lane[j] = acc;
			}
			sum = convolve3x3SR_addsw(convolve3x3SR_addsw(lane[0], lane[2]), lane[1]);
			out[x] = (unsigned char) ((sum < 0) ? 0 : ((sum > 255) ? 255 : sum));
		}
	}

	return (0);
}
/* Signed low 16 bits of a * b: the per-word result the MMX PMULLW instruction keeps. */
static int convolve5x5SR_mullw(int a, int b)
{
	unsigned int low = (unsigned int) (a * b) & 0xFFFFu;

	return (low & 0x8000u) ? (int) low - 0x10000 : (int) low;
}

/* Signed 16-bit saturating addition: one word of the MMX PADDSW instruction. */
static int convolve5x5SR_addsw(int a, int b)
{
	int sum = a + b;

	if (sum > 32767)
		return (32767);
	if (sum < -32768)
		return (-32768);
	return (sum);
}

/*!
\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( sum( (Sij >> NRightShift) * Kij ) )

Every source pixel is shifted right by NRightShift bits BEFORE it is multiplied with
its kernel word. Only interior pixels of Dest are written; the first two and last two
rows and columns of Dest are left untouched.

The MMX path is used when USE_MMX is compiled in and SDL_imageFilterMMXdetect() reports
MMX; otherwise a portable C path is used that reproduces the same 16-bit word arithmetic
(wrapping multiply, saturating adds, final clamp to 0..255).

\param Src The source 2D byte array (rows * columns bytes). The MMX path loads 8 bytes at a time
           and therefore reads up to 3 bytes past the end of a buffer of exactly rows * columns bytes.
\param Dest The destination 2D byte array (rows * columns bytes).
\param rows Number of rows in source/destination array. Must be >= 5.
\param columns Number of columns in source/destination array. Must be >= 5.
\param Kernel The kernel stored as five rows of EIGHT signed words (40 words in total). The first
              five words of a row are its coefficients; the last three are padding and must be 0
              (the MMX path multiplies them with the pixels right of the window, the C path
              ignores them).
\param NRightShift Number of bits each pixel is shifted right. Must be <= 7.

\return Returns 0 for success or -1 for invalid parameters.
*/
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
					       signed short *Kernel, unsigned char NRightShift)
{
	size_t stride;
	int x, y;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
		return (-1);

#ifdef USE_MMX
	if ((SDL_imageFilterMMXdetect())) {
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm5, ebx   	/* copy NRightShift into MM5 */
			mov edx, Kernel   	/* load Kernel address into EDX */
			mov esi, Src   	/* load Src address to ESI */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, 2   	/* 2 column offset from the left edge */
			mov eax, columns   	/* load columns into EAX */
			shl eax, 1   	/* EAX = columns * 2 */
			add edi, eax   	/* 2 row offset from the top edge */
			shr eax, 1   	/* EAX = columns */
			mov ebx, rows   	/* initialize ROWS counter */
			sub ebx, 4   	/* do not use first 2 and last 2 rows */
			/* --- */
L10370:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 4   	/* do not use first 2 and last 2 columns */
			align 16   	/* 16 byte alignment of the loop entry */
L10372:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
			movd mm6, esi   	/* save ESI in MM6 */
			/* --- 1 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- 5 (last row: Src and Kernel pointers are not advanced past it) */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			paddsw mm7, mm3   	/* add 2 left and 2 right result words */
			movq mm2, mm7   	/* copy MM7 into MM2 */
			psrlq mm7, 16   	/* shift 1 left word to the right */
			paddsw mm7, mm2   	/* add 1 left and 1 right result words */
			movd mm1, eax   	/* save EAX in MM1 */
			packuswb mm7, mm0   	/* pack result with unsigned byte saturation */
			movd eax, mm7   	/* copy saturated result into EAX */
			mov [edi], al   	/* copy a byte result into Dest */
			movd eax, mm1   	/* restore saved EAX */
			/* --- */
			movd esi, mm6   	/* move Src pointer to the top pixel */
			sub edx, 72   	/* EDX = Kernel address (4 * 16 + 8 bytes back) */
			inc esi   	/* move Src pointer to the next pixel */
			inc edi   	/* move Dest pointer to the next pixel */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10372   	/* check loop termination, proceed if required */
			add esi, 4   	/* move to the next row in Src */
			add edi, 4   	/* move to the next row in Dest */
			dec ebx   	/* decrease loop counter ROWS */
			jnz L10370   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/*
		 * NOTE(review): all operands are memory references that are used after
		 * `pusha` has moved the stack pointer. If the compiler addresses them
		 * relative to %esp (e.g. frame pointer omitted) they would resolve to
		 * the wrong stack slots - confirm the build flags used for this file.
		 */
		asm volatile
		 ("pusha		     \n\t"
		  "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
		  "xor       %%ebx, %%ebx \n\t"	/* zero EBX */
		  "mov          %5, %%bl  \n\t"	/* load NRightShift into BL */
		  "movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
		  "mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
		  "mov          %1, %%esi \n\t"	/* load Src address to ESI */
		  "mov          %0, %%edi \n\t"	/* load Dest address to EDI */
		  "add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
		  "mov          %3, %%eax \n\t"	/* load columns into EAX */
		  "shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
		  "add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
		  "shr          $1, %%eax \n\t"	/* EAX = columns */
		  "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
		  "sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
		  /* --- */
		  ".L10370:               \n\t"
		  "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
		  "sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
		  ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		  ".L10372:               \n\t"
		  "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
		  "movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
		  /* --- 1 */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		  "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		  "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		  "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		  "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		  "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		  /* --- 2 */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		  "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		  "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		  "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		  "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		  "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		  /* --- 3 */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		  "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		  "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		  "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		  "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		  "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		  /* --- 4 */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		  "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		  "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		  "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		  "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		  "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		  /* --- 5 (last row: Src and Kernel pointers are not advanced past it) */
		  "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		  "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		  "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		  "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		  "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		  "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		  "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		  "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		  "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		  "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		  "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		  "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		  "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		  /* --- */
		  "movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
		  "psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
		  "paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
		  "movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
		  "psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
		  "paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
		  "movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
		  "packuswb  %%mm0, %%mm7 \n\t"	/* pack result with unsigned byte saturation */
		  "movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
		  "mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
		  "movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
		  /* --- */
		  "movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
		  "sub         $72, %%edx \n\t"	/* EDX = Kernel address (4 * 16 + 8 bytes back) */
		  "inc              %%esi \n\t"	/* move Src pointer to the next pixel */
		  "inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
		  /* --- */
		  "dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
		  "jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
		  "add          $4, %%esi \n\t"	/* move to the next row in Src */
		  "add          $4, %%edi \n\t"	/* move to the next row in Dest */
		  "dec              %%ebx \n\t"	/* decrease loop counter ROWS */
		  "jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
		  /* --- */
		  "emms                   \n\t"	/* exit MMX state */
		  "popa                   \n\t"
		  :			/* no outputs: the Dest pointer itself is only read */
		  :"m"(Dest),		/* %0 */
		  "m"(Src),		/* %1 */
		  "m"(rows),		/* %2 */
		  "m"(columns),		/* %3 */
		  "m"(Kernel),		/* %4 */
		  "m"(NRightShift)	/* %5 */
		  :"memory", "cc"	/* the pixels behind Dest are written, flags are changed */
		    );
#endif
		return (0);
	}
#endif

	/*
	 * Portable path. Mirrors the word arithmetic of the MMX code: four
	 * accumulators (kernel column 4 shares the accumulator of column 0),
	 * summed over the five rows and then combined pairwise, each step with
	 * 16-bit saturation.
	 */
	stride = (size_t) columns;
	for (y = 2; y < rows - 2; y++) {
		const unsigned char *win = Src + (size_t) (y - 2) * stride;	/* top-left pixel of the first window */
		unsigned char *out = Dest + (size_t) y * stride + 2;	/* first interior pixel of this row */

		for (x = 0; x < columns - 4; x++) {
			int acc[4] = { 0, 0, 0, 0 };
			int r, sum;

			for (r = 0; r < 5; r++) {
				const unsigned char *p = win + (size_t) r * stride + x;
				const signed short *k = Kernel + 8 * r;
				int j;

				acc[0] = convolve5x5SR_addsw(acc[0],
							     convolve5x5SR_addsw(convolve5x5SR_mullw(p[0] >> NRightShift, k[0]),
										 convolve5x5SR_mullw(p[4] >> NRightShift, k[4])));
				for (j = 1; j < 4; j++) {
					acc[j] = convolve5x5SR_addsw(acc[j], convolve5x5SR_mullw(p[j] >> NRightShift, k[j]));
				}
			}
			sum = convolve5x5SR_addsw(convolve5x5SR_addsw(acc[0], acc[2]), convolve5x5SR_addsw(acc[1], acc[3]));
			out[x] = (unsigned char) ((sum < 0) ? 0 : ((sum > 255) ? 255 : sum));
		}
	}

	return (0);
}
SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 06072 signed short *Kernel, unsigned char NRightShift) 06073 { 06074 /* Validate input parameters */ 06075 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 06076 return(-1); 06077 06078 if ((columns < 7) || (rows < 7) || (NRightShift > 7)) 06079 return (-1); 06080 06081 if ((SDL_imageFilterMMXdetect())) { 06082 #ifdef USE_MMX 06083 #if !defined(GCC__) 06084 __asm 06085 { 06086 pusha 06087 pxor mm0, mm0 /* zero MM0 */ 06088 xor ebx, ebx /* zero EBX */ 06089 mov bl, NRightShift /* load NRightShift into BL */ 06090 movd mm5, ebx /* copy NRightShift into MM5 */ 06091 mov edx, Kernel /* load Kernel address into EDX */ 06092 mov esi, Src /* load Src address to ESI */ 06093 mov edi, Dest /* load Dest address to EDI */ 06094 add edi, 3 /* 3 column offset from the left edge */ 06095 mov eax, columns /* load columns into EAX */ 06096 add edi, eax /* 3 row offset from the top edge */ 06097 add edi, eax 06098 add edi, eax 06099 mov ebx, rows /* initialize ROWS counter */ 06100 sub ebx, 6 /* do not use first 3 and last 3 rows */ 06101 /* ---, */ 06102 L10380: 06103 mov ecx, eax /* initialize COLUMNS counter */ 06104 sub ecx, 6 /* do not use first 3 and last 3 columns */ 06105 align 16 /* 16 byte alignment of the loop entry */ 06106 L10382: 06107 pxor mm7, mm7 /* zero MM7 (accumulator) */ 06108 movd mm6, esi /* save ESI in MM6 */ 06109 /* --- 1 */ 06110 movq mm1, [esi] /* load 8 bytes of the Src */ 06111 movq mm2, mm1 /* copy MM1 into MM2 */ 06112 add esi, eax /* move Src pointer 1 row below */ 06113 movq mm3, [edx] /* load 4 words of Kernel */ 06114 add edx, 8 /* move pointer to other 4 words */ 06115 movq mm4, [edx] /* load 4 words of Kernel */ 06116 add edx, 8 /* move pointer to other 4 words */ 06117 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06118 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06119 psrlw mm1, mm5 /* shift right each 
pixel NshiftRight times */ 06120 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06121 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06122 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06123 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06124 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06125 /* --- 2 */ 06126 movq mm1, [esi] /* load 8 bytes of the Src */ 06127 movq mm2, mm1 /* copy MM1 into MM2 */ 06128 add esi, eax /* move Src pointer 1 row below */ 06129 movq mm3, [edx] /* load 4 words of Kernel */ 06130 add edx, 8 /* move pointer to other 4 words */ 06131 movq mm4, [edx] /* load 4 words of Kernel */ 06132 add edx, 8 /* move pointer to other 4 words */ 06133 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06134 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06135 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06136 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06137 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06138 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06139 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06140 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06141 /* --- 3 */ 06142 movq mm1, [esi] /* load 8 bytes of the Src */ 06143 movq mm2, mm1 /* copy MM1 into MM2 */ 06144 add esi, eax /* move Src pointer 1 row below */ 06145 movq mm3, [edx] /* load 4 words of Kernel */ 06146 add edx, 8 /* move pointer to other 4 words */ 06147 movq mm4, [edx] /* load 4 words of Kernel */ 06148 add edx, 8 /* move pointer to other 4 words */ 06149 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06150 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06151 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06152 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06153 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06154 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 
06155 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06156 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06157 /* --- 4 */ 06158 movq mm1, [esi] /* load 8 bytes of the Src */ 06159 movq mm2, mm1 /* copy MM1 into MM2 */ 06160 add esi, eax /* move Src pointer 1 row below */ 06161 movq mm3, [edx] /* load 4 words of Kernel */ 06162 add edx, 8 /* move pointer to other 4 words */ 06163 movq mm4, [edx] /* load 4 words of Kernel */ 06164 add edx, 8 /* move pointer to other 4 words */ 06165 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06166 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06167 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06168 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06169 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06170 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06171 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06172 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06173 /* --- 5 */ 06174 movq mm1, [esi] /* load 8 bytes of the Src */ 06175 movq mm2, mm1 /* copy MM1 into MM2 */ 06176 add esi, eax /* move Src pointer 1 row below */ 06177 movq mm3, [edx] /* load 4 words of Kernel */ 06178 add edx, 8 /* move pointer to other 4 words */ 06179 movq mm4, [edx] /* load 4 words of Kernel */ 06180 add edx, 8 /* move pointer to other 4 words */ 06181 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06182 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06183 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06184 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06185 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06186 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06187 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06188 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06189 /* --- 6 */ 06190 movq mm1, [esi] /* load 8 bytes of the Src */ 06191 movq mm2, mm1 /* copy MM1 
into MM2 */ 06192 add esi, eax /* move Src pointer 1 row below */ 06193 movq mm3, [edx] /* load 4 words of Kernel */ 06194 add edx, 8 /* move pointer to other 4 words */ 06195 movq mm4, [edx] /* load 4 words of Kernel */ 06196 add edx, 8 /* move pointer to other 4 words */ 06197 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06198 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06199 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06200 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06201 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06202 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06203 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06204 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06205 /* --- 7 */ 06206 movq mm1, [esi] /* load 8 bytes of the Src */ 06207 movq mm2, mm1 /* copy MM1 into MM2 */ 06208 movq mm3, [edx] /* load 4 words of Kernel */ 06209 add edx, 8 /* move pointer to other 4 words */ 06210 movq mm4, [edx] /* load 4 words of Kernel */ 06211 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06212 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06213 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06214 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06215 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06216 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06217 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06218 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06219 /* ---, */ 06220 movq mm3, mm7 /* copy MM7 into MM3 */ 06221 psrlq mm7, 32 /* shift 2 left words to the right */ 06222 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 06223 movq mm2, mm7 /* copy MM7 into MM2 */ 06224 psrlq mm7, 16 /* shift 1 left word to the right */ 06225 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 06226 movd mm1, eax /* save EAX in MM1 */ 06227 packuswb mm7, mm0 /* pack division 
result with saturation */ 06228 movd eax, mm7 /* copy saturated result into EAX */ 06229 mov [edi], al /* copy a byte result into Dest */ 06230 movd eax, mm1 /* restore saved EAX */ 06231 /* --, */ 06232 movd esi, mm6 /* move Src pointer to the top pixel */ 06233 sub edx, 104 /* EDX = Kernel address */ 06234 inc esi /* move Src pointer to the next pixel */ 06235 inc edi /* move Dest pointer to the next pixel */ 06236 /* ---, */ 06237 dec ecx /* decrease loop counter COLUMNS */ 06238 jnz L10382 /* check loop termination, proceed if required */ 06239 add esi, 6 /* move to the next row in Src */ 06240 add edi, 6 /* move to the next row in Dest */ 06241 dec ebx /* decrease loop counter ROWS */ 06242 jnz L10380 /* check loop termination, proceed if required */ 06243 /* ---, */ 06244 emms /* exit MMX state */ 06245 popa 06246 } 06247 #else 06248 asm volatile 06249 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 06250 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 06251 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 06252 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 06253 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 06254 "mov %1, %%esi \n\t" /* load Src address to ESI */ 06255 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 06256 "add $3, %%edi \n\t" /* 3 column offset from the left edge */ 06257 "mov %3, %%eax \n\t" /* load columns into EAX */ 06258 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */ 06259 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 06260 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */ 06261 /* --- */ 06262 ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 06263 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */ 06264 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 06265 ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 06266 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 06267 /* 
--- 1 */ 06268 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06269 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06270 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06271 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06272 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06273 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06274 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06275 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06276 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06277 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06278 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06279 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06280 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06281 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06282 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06283 /* --- 2 */ 06284 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06285 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06286 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06287 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06288 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06289 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06290 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06291 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06292 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06293 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06294 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06295 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06296 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06297 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06298 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06299 /* --- 3 */ 06300 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06301 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06302 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06303 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06304 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06305 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06306 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06307 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06308 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06309 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06310 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06311 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06312 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06313 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06314 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06315 /* --- 4 */ 06316 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06317 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06318 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06319 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06320 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06321 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06322 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06323 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06324 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06325 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06326 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06327 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06328 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06329 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06330 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06331 /* --- 5 */ 06332 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06333 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06334 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06335 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06336 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06337 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06338 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06339 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06340 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06341 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06342 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06343 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06344 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06345 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06346 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06347 /* --- 6 */ 06348 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06349 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06350 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06351 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06352 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06353 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06354 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06355 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06356 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06357 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06358 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06359 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06360 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06361 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06362 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06363 /* --- 7 */ 06364 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06365 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06366 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06367 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06368 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06369 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06370 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06371 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06372 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06373 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06374 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06375 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06376 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06377 /* --- */ 06378 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 06379 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 06380 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 06381 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 06382 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 06383 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 06384 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 06385 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 06386 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 06387 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 06388 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 06389 /* -- */ 06390 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 06391 "sub $104, %%edx \n\t" /* EDX = Kernel address */ 06392 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 06393 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 06394 /* --- */ 06395 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 06396 "jnz .L10382 \n\t" /* check loop termination, proceed if required */ 06397 "add $6, %%esi \n\t" /* move to the next row in Src */ 06398 "add $6, %%edi \n\t" /* move to the next row in Dest */ 06399 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 06400 "jnz .L10380 \n\t" /* check loop termination, proceed if required */ 06401 /* --- */ 06402 "emms \n\t" /* exit MMX state */ 06403 "popa \n\t":"=m" (Dest) /* %0 */ 06404 :"m"(Src), /* %1 */ 06405 "m"(rows), /* %2 */ 06406 "m"(columns), /* %3 */ 06407 "m"(Kernel), /* %4 */ 06408 "m"(NRightShift) /* %5 */ 06409 ); 06410 #endif 06411 #endif 06412 return (0); 06413 } else { 06414 /* No non-MMX implementation yet */ 06415 return (-1); 06416 } 06417 } 06418 06433 int 
SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 06434 signed short *Kernel, unsigned char NRightShift) 06435 { 06436 /* Validate input parameters */ 06437 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 06438 return(-1); 06439 06440 if ((columns < 9) || (rows < 9) || (NRightShift > 7)) 06441 return (-1); 06442 06443 if ((SDL_imageFilterMMXdetect())) { 06444 #ifdef USE_MMX 06445 #if !defined(GCC__) 06446 __asm 06447 { 06448 pusha 06449 pxor mm0, mm0 /* zero MM0 */ 06450 xor ebx, ebx /* zero EBX */ 06451 mov bl, NRightShift /* load NRightShift into BL */ 06452 movd mm5, ebx /* copy NRightShift into MM5 */ 06453 mov edx, Kernel /* load Kernel address into EDX */ 06454 mov esi, Src /* load Src address to ESI */ 06455 mov edi, Dest /* load Dest address to EDI */ 06456 add edi, 4 /* 4 column offset from the left edge */ 06457 mov eax, columns /* load columns into EAX */ 06458 add edi, eax /* 4 row offset from the top edge */ 06459 add edi, eax 06460 add edi, eax 06461 add edi, eax 06462 mov ebx, rows /* initialize ROWS counter */ 06463 sub ebx, 8 /* do not use first 4 and last 4 rows */ 06464 /* ---, */ 06465 L10390: 06466 mov ecx, eax /* initialize COLUMNS counter */ 06467 sub ecx, 8 /* do not use first 4 and last 4 columns */ 06468 align 16 /* 16 byte alignment of the loop entry */ 06469 L10392: 06470 pxor mm7, mm7 /* zero MM7 (accumulator) */ 06471 movd mm6, esi /* save ESI in MM6 */ 06472 /* --- 1 */ 06473 movq mm1, [esi] /* load 8 bytes of the Src */ 06474 movq mm2, mm1 /* copy MM1 into MM2 */ 06475 inc esi /* move pointer to the next 8 bytes of Src */ 06476 movq mm3, [edx] /* load 4 words of Kernel */ 06477 add edx, 8 /* move pointer to other 4 words */ 06478 movq mm4, [edx] /* load 4 words of Kernel */ 06479 add edx, 8 /* move pointer to other 4 words */ 06480 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06481 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06482 psrlw mm1, 
mm5 /* shift right each pixel NshiftRight times */ 06483 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06484 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06485 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06486 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06487 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06488 movq mm1, [esi] /* load 8 bytes of the Src */ 06489 dec esi 06490 add esi, eax /* move Src pointer 1 row below */ 06491 movq mm3, [edx] /* load 4 words of Kernel */ 06492 add edx, 8 /* move pointer to other 4 words */ 06493 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06494 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06495 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06496 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06497 /* --- 2 */ 06498 movq mm1, [esi] /* load 8 bytes of the Src */ 06499 movq mm2, mm1 /* copy MM1 into MM2 */ 06500 inc esi /* move pointer to the next 8 bytes of Src */ 06501 movq mm3, [edx] /* load 4 words of Kernel */ 06502 add edx, 8 /* move pointer to other 4 words */ 06503 movq mm4, [edx] /* load 4 words of Kernel */ 06504 add edx, 8 /* move pointer to other 4 words */ 06505 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06506 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06507 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06508 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06509 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06510 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06511 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06512 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06513 movq mm1, [esi] /* load 8 bytes of the Src */ 06514 dec esi 06515 add esi, eax /* move Src pointer 1 row below */ 06516 movq mm3, [edx] /* load 4 words of Kernel */ 06517 add edx, 8 /* move pointer to other 4 words */ 06518 punpcklbw mm1, mm0 /* unpack first 4 
bytes into words */ 06519 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06520 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06521 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06522 /* --- 3 */ 06523 movq mm1, [esi] /* load 8 bytes of the Src */ 06524 movq mm2, mm1 /* copy MM1 into MM2 */ 06525 inc esi /* move pointer to the next 8 bytes of Src */ 06526 movq mm3, [edx] /* load 4 words of Kernel */ 06527 add edx, 8 /* move pointer to other 4 words */ 06528 movq mm4, [edx] /* load 4 words of Kernel */ 06529 add edx, 8 /* move pointer to other 4 words */ 06530 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06531 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06532 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06533 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06534 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06535 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06536 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06537 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06538 movq mm1, [esi] /* load 8 bytes of the Src */ 06539 dec esi 06540 add esi, eax /* move Src pointer 1 row below */ 06541 movq mm3, [edx] /* load 4 words of Kernel */ 06542 add edx, 8 /* move pointer to other 4 words */ 06543 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06544 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06545 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06546 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06547 /* --- 4 */ 06548 movq mm1, [esi] /* load 8 bytes of the Src */ 06549 movq mm2, mm1 /* copy MM1 into MM2 */ 06550 inc esi /* move pointer to the next 8 bytes of Src */ 06551 movq mm3, [edx] /* load 4 words of Kernel */ 06552 add edx, 8 /* move pointer to other 4 words */ 06553 movq mm4, [edx] /* load 4 words of Kernel */ 06554 add edx, 8 /* move pointer to other 4 words */ 06555 punpcklbw mm1, mm0 /* unpack first 4 
bytes into words */ 06556 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06557 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06558 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06559 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06560 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06561 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06562 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06563 movq mm1, [esi] /* load 8 bytes of the Src */ 06564 dec esi 06565 add esi, eax /* move Src pointer 1 row below */ 06566 movq mm3, [edx] /* load 4 words of Kernel */ 06567 add edx, 8 /* move pointer to other 4 words */ 06568 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06569 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06570 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06571 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06572 /* --- 5 */ 06573 movq mm1, [esi] /* load 8 bytes of the Src */ 06574 movq mm2, mm1 /* copy MM1 into MM2 */ 06575 inc esi /* move pointer to the next 8 bytes of Src */ 06576 movq mm3, [edx] /* load 4 words of Kernel */ 06577 add edx, 8 /* move pointer to other 4 words */ 06578 movq mm4, [edx] /* load 4 words of Kernel */ 06579 add edx, 8 /* move pointer to other 4 words */ 06580 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06581 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06582 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06583 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06584 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06585 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06586 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06587 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06588 movq mm1, [esi] /* load 8 bytes of the Src */ 06589 dec esi 06590 add esi, eax /* move Src pointer 1 row below */ 06591 movq mm3, [edx] /* load 4 words of 
Kernel */ 06592 add edx, 8 /* move pointer to other 4 words */ 06593 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06594 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06595 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06596 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06597 /* --- 6 */ 06598 movq mm1, [esi] /* load 8 bytes of the Src */ 06599 movq mm2, mm1 /* copy MM1 into MM2 */ 06600 inc esi /* move pointer to the next 8 bytes of Src */ 06601 movq mm3, [edx] /* load 4 words of Kernel */ 06602 add edx, 8 /* move pointer to other 4 words */ 06603 movq mm4, [edx] /* load 4 words of Kernel */ 06604 add edx, 8 /* move pointer to other 4 words */ 06605 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06606 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06607 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06608 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06609 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06610 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06611 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06612 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06613 movq mm1, [esi] /* load 8 bytes of the Src */ 06614 dec esi 06615 add esi, eax /* move Src pointer 1 row below */ 06616 movq mm3, [edx] /* load 4 words of Kernel */ 06617 add edx, 8 /* move pointer to other 4 words */ 06618 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06619 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06620 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06621 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06622 /* --- 7 */ 06623 movq mm1, [esi] /* load 8 bytes of the Src */ 06624 movq mm2, mm1 /* copy MM1 into MM2 */ 06625 inc esi /* move pointer to the next 8 bytes of Src */ 06626 movq mm3, [edx] /* load 4 words of Kernel */ 06627 add edx, 8 /* move pointer to other 4 words */ 06628 movq mm4, [edx] /* load 4 words of 
Kernel */ 06629 add edx, 8 /* move pointer to other 4 words */ 06630 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06631 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06632 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06633 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06634 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06635 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06636 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06637 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06638 movq mm1, [esi] /* load 8 bytes of the Src */ 06639 dec esi 06640 add esi, eax /* move Src pointer 1 row below */ 06641 movq mm3, [edx] /* load 4 words of Kernel */ 06642 add edx, 8 /* move pointer to other 4 words */ 06643 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06644 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06645 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06646 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06647 /* --- 8 */ 06648 movq mm1, [esi] /* load 8 bytes of the Src */ 06649 movq mm2, mm1 /* copy MM1 into MM2 */ 06650 inc esi /* move pointer to the next 8 bytes of Src */ 06651 movq mm3, [edx] /* load 4 words of Kernel */ 06652 add edx, 8 /* move pointer to other 4 words */ 06653 movq mm4, [edx] /* load 4 words of Kernel */ 06654 add edx, 8 /* move pointer to other 4 words */ 06655 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06656 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06657 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06658 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06659 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06660 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06661 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06662 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06663 movq mm1, [esi] /* load 8 bytes of the Src */ 06664 
dec esi 06665 add esi, eax /* move Src pointer 1 row below */ 06666 movq mm3, [edx] /* load 4 words of Kernel */ 06667 add edx, 8 /* move pointer to other 4 words */ 06668 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06669 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06670 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06671 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06672 /* --- 9 */ 06673 movq mm1, [esi] /* load 8 bytes of the Src */ 06674 movq mm2, mm1 /* copy MM1 into MM2 */ 06675 inc esi /* move pointer to the next 8 bytes of Src */ 06676 movq mm3, [edx] /* load 4 words of Kernel */ 06677 add edx, 8 /* move pointer to other 4 words */ 06678 movq mm4, [edx] /* load 4 words of Kernel */ 06679 add edx, 8 /* move pointer to other 4 words */ 06680 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06681 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 06682 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06683 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06684 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06685 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06686 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06687 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06688 movq mm1, [esi] /* load 8 bytes of the Src */ 06689 movq mm3, [edx] /* load 4 words of Kernel */ 06690 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 06691 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06692 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06693 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06694 /* ---, */ 06695 movq mm3, mm7 /* copy MM7 into MM3 */ 06696 psrlq mm7, 32 /* shift 2 left words to the right */ 06697 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 06698 movq mm2, mm7 /* copy MM7 into MM2 */ 06699 psrlq mm7, 16 /* shift 1 left word to the right */ 06700 paddsw mm7, mm2 /* add 1 left and 1 right result 
words */ 06701 movd mm1, eax /* save EAX in MM1 */ 06702 packuswb mm7, mm0 /* pack division result with saturation */ 06703 movd eax, mm7 /* copy saturated result into EAX */ 06704 mov [edi], al /* copy a byte result into Dest */ 06705 movd eax, mm1 /* restore saved EAX */ 06706 /* --, */ 06707 movd esi, mm6 /* move Src pointer to the top pixel */ 06708 sub edx, 208 /* EDX = Kernel address */ 06709 inc esi /* move Src pointer to the next pixel */ 06710 inc edi /* move Dest pointer to the next pixel */ 06711 /* ---, */ 06712 dec ecx /* decrease loop counter COLUMNS */ 06713 jnz L10392 /* check loop termination, proceed if required */ 06714 add esi, 8 /* move to the next row in Src */ 06715 add edi, 8 /* move to the next row in Dest */ 06716 dec ebx /* decrease loop counter ROWS */ 06717 jnz L10390 /* check loop termination, proceed if required */ 06718 /* ---, */ 06719 emms /* exit MMX state */ 06720 popa 06721 } 06722 #else 06723 asm volatile 06724 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 06725 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 06726 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 06727 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 06728 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 06729 "mov %1, %%esi \n\t" /* load Src address to ESI */ 06730 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 06731 "add $4, %%edi \n\t" /* 4 column offset from the left edge */ 06732 "mov %3, %%eax \n\t" /* load columns into EAX */ 06733 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */ 06734 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 06735 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */ 06736 /* --- */ 06737 ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 06738 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */ 06739 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 06740 ".L10392: \n\t" 
"pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 06741 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 06742 /* --- 1 */ 06743 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06744 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06745 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06746 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06747 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06748 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06749 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06750 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06751 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06752 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06753 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06754 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06755 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06756 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06757 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06758 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06759 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06760 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06761 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06762 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06763 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06764 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06765 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06766 /* --- 2 */ 06767 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06768 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06769 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06770 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06771 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06772 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06773 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06774 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06775 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06776 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06777 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06778 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06779 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06780 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06781 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06782 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06783 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06784 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06785 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06786 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06787 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06788 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06789 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06790 /* --- 3 */ 06791 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06792 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06793 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06794 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06795 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06796 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06797 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06798 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06799 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06800 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06801 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06802 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06803 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06804 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06805 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06806 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06807 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06808 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06809 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06810 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06811 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06812 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06813 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06814 /* --- 4 */ 06815 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06816 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06817 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06818 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06819 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06820 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06821 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06822 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06823 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06824 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06825 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06826 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06827 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06828 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06829 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06830 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06831 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06832 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06833 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06834 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06835 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06836 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06837 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06838 /* --- 5 */ 06839 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06840 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06841 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06842 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06843 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06844 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06845 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06846 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06847 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06848 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06849 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06850 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06851 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06852 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06853 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06854 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06855 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06856 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06857 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06858 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06859 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06860 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06861 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06862 /* --- 6 */ 06863 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06864 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06865 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06866 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06867 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06868 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06869 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06870 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06871 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06872 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06873 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06874 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06875 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06876 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06877 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06878 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06879 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06880 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06881 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06882 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06883 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06884 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06885 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06886 /* --- 7 */ 06887 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06888 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06889 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06890 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06891 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06892 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06893 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06894 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06895 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06896 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06897 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06898 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06899 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06900 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06901 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06902 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06903 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06904 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06905 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06906 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06907 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06908 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06909 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06910 /* --- 8 */ 06911 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06912 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06913 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06914 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06915 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06916 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06917 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06918 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06919 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06920 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06921 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06922 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06923 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06924 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06925 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06926 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06927 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06928 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06929 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06930 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06931 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06932 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06933 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06934 /* --- 9 */ 06935 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06936 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06937 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 06938 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06939 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06940 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06941 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06942 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06943 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06944 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06945 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06946 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06947 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06948 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06949 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06950 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06951 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06952 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06953 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06954 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 06955 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06956 /* --- */ 06957 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 06958 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 06959 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 06960 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 06961 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 06962 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 06963 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 06964 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 06965 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 06966 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 06967 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 06968 /* -- */ 06969 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 06970 "sub $208, %%edx \n\t" /* EDX = Kernel address */ 06971 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 06972 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 06973 /* --- */ 06974 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 06975 "jnz .L10392 \n\t" /* check loop termination, proceed if required */ 06976 "add $8, %%esi \n\t" /* move to the next row in Src */ 06977 "add $8, %%edi \n\t" /* move to the next row in Dest */ 06978 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 06979 "jnz .L10390 \n\t" /* check loop termination, proceed if required */ 06980 /* --- */ 06981 "emms \n\t" /* exit MMX state */ 06982 "popa \n\t":"=m" (Dest) /* %0 */ 06983 :"m"(Src), /* %1 */ 06984 "m"(rows), /* %2 */ 06985 "m"(columns), /* %3 */ 06986 "m"(Kernel), /* %4 */ 06987 "m"(NRightShift) /* %5 */ 06988 ); 06989 #endif 06990 #endif 06991 return (0); 06992 } else { 06993 /* No non-MMX implementation yet */ 06994 return (-1); 06995 } 06996 } 06997 06998 /* ------------------------------------------------------------------------------------ */ 06999 07012 
/*!
\brief Filter using SobelX: Dest = |horizontal Sobel gradient of Src| (saturated to 255).

For every processed pixel the three rows around it are summed column-wise with
weights 1-2-1 (top, middle, bottom) and the result is the absolute difference
between the column sum to the right and the column sum to the left of the pixel.

Only an MMX implementation exists (32-bit x86: the code uses pusha/popa and
32-bit pointer registers). Pixels are processed in groups of 8:

- Rows 0 and rows-1 of Dest are never written.
- In every other row, 8*(columns/8) bytes are written starting at column 1.
  If columns is a multiple of 8 the last byte of each group lands in column 0
  of the following row; otherwise the rightmost columns stay untouched.
- NOTE(review): the last 8-byte load of the last processed row reads up to
  index rows*columns+1 of Src when columns % 8 is 0 or 1, i.e. 1-2 bytes past
  a tightly sized buffer. Callers should over-allocate Src accordingly.

\param Src The source 2D byte array (rows * columns bytes, row-major).
\param Dest The destination 2D byte array (rows * columns bytes, row-major).
\param rows Number of rows in source/destination array. Must be >= 3.
\param columns Number of columns in source/destination array. Must be >= 8.

\return Returns 0 for success or -1 for error (NULL pointer, array too small,
        MMX not compiled in, MMX not detected or switched off).
*/
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);

	if ((columns < 8) || (rows < 3))
		return (-1);

#ifdef USE_MMX
	/* The guard wraps the whole branch so that success is only ever reported
	   when the MMX code below has really been compiled in and executed. */
	if ((SDL_imageFilterMMXdetect())) {
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			mov eax, columns   	/* load columns into EAX */
			/* --- */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last rows */
			/* --- */
L10400:
			mov ecx, eax   	/* initialize COLUMNS counter */
			shr ecx, 3   	/* ECX/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi   	/* save ESI in EBX */
			movd mm1, edi   	/* save EDI in MM1 */
			align 16   	/* 16 byte alignment of the loop entry */
L10402:
			/* --- top row: MM4/MM5 = pixels 0..7, MM6/MM7 = pixels 2..9 */
			movq mm4, [esi]   	/* load 8 bytes from Src */
			movq mm5, mm4   	/* save MM4 in MM5 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
			movq mm6, [esi]   	/* load 8 bytes from Src */
			movq mm7, mm6   	/* save MM6 in MM7 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
			/* --- middle row: added twice (weight 2) */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- bottom row: added once (weight 1) */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- differences: column sum [i+2] - column sum [i] */
			movq mm2, mm4   	/* copy MM4 into MM2 */
			psrlq mm4, 32   	/* shift 2 left words to the right */
			psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
			movq mm3, mm6   	/* copy MM6 into MM3 */
			psrlq mm6, 32   	/* shift 2 left words to the right */
			psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5   	/* copy MM5 into MM2 */
			psrlq mm5, 32   	/* shift 2 left words to the right */
			psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4   	/* copy MM4 into MM6 */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			psraw mm6, 15   	/* fill MM6 words with word sign bit */
			psraw mm7, 15   	/* fill MM7 words with word sign bit */
			pxor mm4, mm6   	/* take 1's complement of only neg. words */
			pxor mm5, mm7   	/* take 1's complement of only neg. words */
			psubsw mm4, mm6   	/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7   	/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4   	/* store result in Dest */
			/* --- */
			sub esi, eax   	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8   	/* move Src pointer to the next 8 pixels */
			add edi, 8   	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10402   	/* check loop termination, proceed if required */
			mov esi, ebx   	/* restore most left current row Src address */
			movd edi, mm1   	/* restore most left current row Dest address */
			add esi, eax   	/* move to the next row in Src */
			add edi, eax   	/* move to the next row in Dest */
			dec edx   	/* decrease loop counter ROWS */
			jnz L10400   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/*
		 * The arguments are handed over in fixed registers (EDI, ESI, EDX, EAX)
		 * rather than as memory operands: after "pusha" the stack pointer has
		 * moved by 32 bytes, so an ESP-relative memory operand would address the
		 * wrong slot. pusha/popa save and restore every general purpose register,
		 * therefore the input registers hold their original values again when the
		 * statement ends and EBX/ECX need no clobber entry.
		 * All operands are inputs (Dest itself is only read; the data it points to
		 * is covered by the "memory" clobber). Numeric local labels are used so
		 * the statement can be duplicated by the compiler without symbol clashes.
		 */
		__asm__ __volatile__
			("pusha                     \n\t"	/* save all general purpose registers */
			 "pxor %%mm0, %%mm0         \n\t"	/* zero MM0 */
			 /* EAX = columns, ESI = Src row 0 address, EDI = Dest, EDX = rows */
			 "add %%eax, %%edi          \n\t"	/* EDI = EDI + columns */
			 "inc %%edi                 \n\t"	/* 1 byte offset from the left edge */
			 "sub $2, %%edx             \n\t"	/* do not use first and last rows */
			 /* --- */
			 "1:                        \n\t"
			 "mov %%eax, %%ecx          \n\t"	/* initialize COLUMNS counter */
			 "shr $3, %%ecx             \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
			 "mov %%esi, %%ebx          \n\t"	/* save ESI in EBX */
			 "movd %%edi, %%mm1         \n\t"	/* save EDI in MM1 */
			 ".align 16                 \n\t"	/* 16 byte alignment of the loop entry */
			 "2:                        \n\t"
			 /* --- top row: MM4/MM5 = pixels 0..7, MM6/MM7 = pixels 2..9 */
			 "movq (%%esi), %%mm4       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm4, %%mm5         \n\t"	/* save MM4 in MM5 */
			 "add $2, %%esi             \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm4    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm5    \n\t"	/* unpack 4 high bytes into words */
			 "movq (%%esi), %%mm6       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm6, %%mm7         \n\t"	/* save MM6 in MM7 */
			 "sub $2, %%esi             \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm6    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm7    \n\t"	/* unpack 4 high bytes into words */
			 /* --- middle row: added twice (weight 2) */
			 "add %%eax, %%esi          \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3         \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi             \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3    \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm4        \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5        \n\t"	/* add 4 high words to accumulator MM5 */
			 "paddw %%mm2, %%mm4        \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5        \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3         \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi             \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3    \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm6        \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7        \n\t"	/* add 4 high words to accumulator MM7 */
			 "paddw %%mm2, %%mm6        \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7        \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- bottom row: added once (weight 1) */
			 "add %%eax, %%esi          \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3         \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi             \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3    \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm4        \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5        \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2       \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3         \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi             \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2    \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3    \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm6        \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7        \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- differences: column sum [i+2] - column sum [i] */
			 "movq %%mm4, %%mm2         \n\t"	/* copy MM4 into MM2 */
			 "psrlq $32, %%mm4          \n\t"	/* shift 2 left words to the right */
			 "psubw %%mm2, %%mm4        \n\t"	/* MM4 = MM4 - MM2 */
			 "movq %%mm6, %%mm3         \n\t"	/* copy MM6 into MM3 */
			 "psrlq $32, %%mm6          \n\t"	/* shift 2 left words to the right */
			 "psubw %%mm3, %%mm6        \n\t"	/* MM6 = MM6 - MM3 */
			 "punpckldq %%mm6, %%mm4    \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			 "movq %%mm5, %%mm2         \n\t"	/* copy MM5 into MM2 */
			 "psrlq $32, %%mm5          \n\t"	/* shift 2 left words to the right */
			 "psubw %%mm2, %%mm5        \n\t"	/* MM5 = MM5 - MM2 */
			 "movq %%mm7, %%mm3         \n\t"	/* copy MM7 into MM3 */
			 "psrlq $32, %%mm7          \n\t"	/* shift 2 left words to the right */
			 "psubw %%mm3, %%mm7        \n\t"	/* MM7 = MM7 - MM3 */
			 "punpckldq %%mm7, %%mm5    \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			 /* Take abs values of MM4 and MM5 */
			 "movq %%mm4, %%mm6         \n\t"	/* copy MM4 into MM6 */
			 "movq %%mm5, %%mm7         \n\t"	/* copy MM5 into MM7 */
			 "psraw $15, %%mm6          \n\t"	/* fill MM6 words with word sign bit */
			 "psraw $15, %%mm7          \n\t"	/* fill MM7 words with word sign bit */
			 "pxor %%mm6, %%mm4         \n\t"	/* take 1's complement of only neg. words */
			 "pxor %%mm7, %%mm5         \n\t"	/* take 1's complement of only neg. words */
			 "psubsw %%mm6, %%mm4       \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "psubsw %%mm7, %%mm5       \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "packuswb %%mm5, %%mm4     \n\t"	/* combine and pack/saturate MM5 and MM4 */
			 "movq %%mm4, (%%edi)       \n\t"	/* store result in Dest */
			 /* --- */
			 "sub %%eax, %%esi          \n\t"	/* move to the current top row in Src */
			 "sub %%eax, %%esi          \n\t"
			 "add $8, %%esi             \n\t"	/* move Src pointer to the next 8 pixels */
			 "add $8, %%edi             \n\t"	/* move Dest pointer to the next 8 pixels */
			 /* --- */
			 "dec %%ecx                 \n\t"	/* decrease loop counter COLUMNS */
			 "jnz 2b                    \n\t"	/* check loop termination, proceed if required */
			 "mov %%ebx, %%esi          \n\t"	/* restore most left current row Src address */
			 "movd %%mm1, %%edi         \n\t"	/* restore most left current row Dest address */
			 "add %%eax, %%esi          \n\t"	/* move to the next row in Src */
			 "add %%eax, %%edi          \n\t"	/* move to the next row in Dest */
			 "dec %%edx                 \n\t"	/* decrease loop counter ROWS */
			 "jnz 1b                    \n\t"	/* check loop termination, proceed if required */
			 /* --- */
			 "emms                      \n\t"	/* exit MMX state */
			 "popa                      \n\t"	/* restore all general purpose registers */
			 :	/* no outputs: results are stored through EDI */
			 :"D"(Dest),	/* EDI */
			  "S"(Src),	/* ESI */
			  "d"(rows),	/* EDX */
			  "a"(columns)	/* EAX */
			 :"memory", "cc"	/* Dest buffer is written, flags are changed */
			);
#endif
		return (0);
	}
#endif

	/* No non-MMX implementation yet */
	return (-1);
}
---, */ 07285 mov esi, Src /* ESI = Src row 0 address */ 07286 mov edi, Dest /* load Dest address to EDI */ 07287 add edi, eax /* EDI = EDI + columns */ 07288 inc edi /* 1 byte offset from the left edge */ 07289 /* initialize ROWS counter */ 07290 sub rows, 2 /* do not use first and last rows */ 07291 /* ---, */ 07292 L10410: 07293 mov ecx, eax /* initialize COLUMS counter */ 07294 shr ecx, 3 /* EBX/8 (MMX loads 8 bytes at a time) */ 07295 mov ebx, esi /* save ESI in EBX */ 07296 mov edx, edi /* save EDI in EDX */ 07297 align 16 /* 16 byte alignment of the loop entry */ 07298 L10412: 07299 /* ---, */ 07300 movq mm4, [esi] /* load 8 bytes from Src */ 07301 movq mm5, mm4 /* save MM4 in MM5 */ 07302 add esi, 2 /* move ESI pointer 2 bytes right */ 07303 punpcklbw mm4, mm0 /* unpack 4 low bytes into words */ 07304 punpckhbw mm5, mm0 /* unpack 4 high bytes into words */ 07305 psrlw mm4, mm1 /* shift right each pixel NshiftRight times */ 07306 psrlw mm5, mm1 /* shift right each pixel NshiftRight times */ 07307 movq mm6, [esi] /* load 8 bytes from Src */ 07308 movq mm7, mm6 /* save MM6 in MM7 */ 07309 sub esi, 2 /* move ESI pointer back 2 bytes left */ 07310 punpcklbw mm6, mm0 /* unpack 4 low bytes into words */ 07311 punpckhbw mm7, mm0 /* unpack 4 high bytes into words */ 07312 psrlw mm6, mm1 /* shift right each pixel NshiftRight times */ 07313 psrlw mm7, mm1 /* shift right each pixel NshiftRight times */ 07314 add esi, eax /* move to the next row of Src */ 07315 movq mm2, [esi] /* load 8 bytes from Src */ 07316 movq mm3, mm2 /* save MM2 in MM3 */ 07317 add esi, 2 /* move ESI pointer 2 bytes right */ 07318 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */ 07319 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */ 07320 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */ 07321 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */ 07322 paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */ 07323 paddw mm5, mm3 /* add 4 high bytes to accumolator 
MM5 */ 07324 paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */ 07325 paddw mm5, mm3 /* add 4 high bytes to accumolator MM5 */ 07326 movq mm2, [esi] /* load 8 bytes from Src */ 07327 movq mm3, mm2 /* save MM2 in MM3 */ 07328 sub esi, 2 /* move ESI pointer back 2 bytes left */ 07329 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */ 07330 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */ 07331 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */ 07332 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */ 07333 paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */ 07334 paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */ 07335 paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */ 07336 paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */ 07337 add esi, eax /* move to the next row of Src */ 07338 movq mm2, [esi] /* load 8 bytes from Src */ 07339 movq mm3, mm2 /* save MM2 in MM3 */ 07340 add esi, 2 /* move ESI pointer 2 bytes right */ 07341 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */ 07342 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */ 07343 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */ 07344 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */ 07345 paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */ 07346 paddw mm5, mm3 /* add 4 high bytes to accumolator MM5 */ 07347 movq mm2, [esi] /* load 8 bytes from Src */ 07348 movq mm3, mm2 /* save MM2 in MM3 */ 07349 sub esi, 2 /* move ESI pointer back 2 bytes left */ 07350 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */ 07351 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */ 07352 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */ 07353 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */ 07354 paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */ 07355 paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */ 07356 /* ---, */ 07357 movq mm2, mm4 /* copy MM4 into MM2 */ 07358 psrlq mm4, 32 /* shift 2 
left words to the right */ 07359 psubw mm4, mm2 /* MM4 = MM4 - MM2 */ 07360 movq mm3, mm6 /* copy MM6 into MM3 */ 07361 psrlq mm6, 32 /* shift 2 left words to the right */ 07362 psubw mm6, mm3 /* MM6 = MM6 - MM3 */ 07363 punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */ 07364 movq mm2, mm5 /* copy MM6 into MM2 */ 07365 psrlq mm5, 32 /* shift 2 left words to the right */ 07366 psubw mm5, mm2 /* MM5 = MM5 - MM2 */ 07367 movq mm3, mm7 /* copy MM7 into MM3 */ 07368 psrlq mm7, 32 /* shift 2 left words to the right */ 07369 psubw mm7, mm3 /* MM7 = MM7 - MM3 */ 07370 punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */ 07371 /* Take abs values of MM4 and MM5 */ 07372 movq mm6, mm4 /* copy MM4 into MM6 */ 07373 movq mm7, mm5 /* copy MM5 into MM7 */ 07374 psraw mm6, 15 /* fill MM6 words with word sign bit */ 07375 psraw mm7, 15 /* fill MM7 words with word sign bit */ 07376 pxor mm4, mm6 /* take 1's compliment of only neg words */ 07377 pxor mm5, mm7 /* take 1's compliment of only neg words */ 07378 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 07379 psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */ 07380 packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */ 07381 movq [edi], mm4 /* store result in Dest */ 07382 /* ---, */ 07383 sub esi, eax /* move to the current top row in Src */ 07384 sub esi, eax 07385 add esi, 8 /* move Src pointer to the next 8 pixels */ 07386 add edi, 8 /* move Dest pointer to the next 8 pixels */ 07387 /* ---, */ 07388 dec ecx /* decrease loop counter COLUMNS */ 07389 jnz L10412 /* check loop termination, proceed if required */ 07390 mov esi, ebx /* restore most left current row Src address */ 07391 mov edi, edx /* restore most left current row Dest address */ 07392 add esi, eax /* move to the next row in Src */ 07393 add edi, eax /* move to the next row in Dest */ 07394 dec rows /* decrease loop counter ROWS */ 07395 jnz L10410 /* check loop termination, proceed if required */ 07396 /* 
---, */ 07397 emms /* exit MMX state */ 07398 popa 07399 } 07400 #else 07401 asm volatile 07402 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 07403 "mov %3, %%eax \n\t" /* load columns into EAX */ 07404 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 07405 "mov %4, %%bl \n\t" /* load NRightShift into BL */ 07406 "movd %%ebx, %%mm1 \n\t" /* copy NRightShift into MM1 */ 07407 /* --- */ 07408 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */ 07409 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 07410 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */ 07411 "inc %%edi \n\t" /* 1 byte offset from the left edge */ 07412 /* initialize ROWS counter */ 07413 "subl $2, %2 \n\t" /* do not use first and last rows */ 07414 /* --- */ 07415 ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */ 07416 "shr $3, %%ecx \n\t" /* EBX/8 (MMX loads 8 bytes at a time) */ 07417 "mov %%esi, %%ebx \n\t" /* save ESI in EBX */ 07418 "mov %%edi, %%edx \n\t" /* save EDI in EDX */ 07419 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 07420 ".L10412: \n\t" 07421 /* --- */ 07422 "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */ 07423 "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */ 07424 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */ 07425 "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */ 07426 "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */ 07427 "psrlw %%mm1, %%mm4 \n\t" /* shift right each pixel NshiftRight times */ 07428 "psrlw %%mm1, %%mm5 \n\t" /* shift right each pixel NshiftRight times */ 07429 "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */ 07430 "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */ 07431 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */ 07432 "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */ 07433 "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */ 07434 "psrlw %%mm1, %%mm6 \n\t" /* shift right each pixel NshiftRight times */ 07435 "psrlw %%mm1, %%mm7 
\n\t" /* shift right each pixel NshiftRight times */ 07436 "add %%eax, %%esi \n\t" /* move to the next row of Src */ 07437 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */ 07438 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */ 07439 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */ 07440 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */ 07441 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */ 07442 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 07443 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */ 07444 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */ 07445 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */ 07446 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */ 07447 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */ 07448 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */ 07449 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */ 07450 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */ 07451 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */ 07452 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */ 07453 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 07454 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */ 07455 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */ 07456 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */ 07457 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */ 07458 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */ 07459 "add %%eax, %%esi \n\t" /* move to the next row of Src */ 07460 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */ 07461 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */ 07462 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */ 07463 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */ 07464 
"punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */ 07465 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 07466 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */ 07467 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */ 07468 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */ 07469 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */ 07470 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */ 07471 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */ 07472 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */ 07473 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */ 07474 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 07475 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */ 07476 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */ 07477 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */ 07478 /* --- */ 07479 "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */ 07480 "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */ 07481 "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */ 07482 "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */ 07483 "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */ 07484 "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */ 07485 "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */ 07486 "movq %%mm5, %%mm2 \n\t" /* copy MM6 into MM2 */ 07487 "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */ 07488 "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */ 07489 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 07490 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 07491 "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */ 07492 "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */ 07493 /* Take abs values of MM4 and MM5 */ 07494 "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */ 07495 
"movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */ 07496 "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */ 07497 "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */ 07498 "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */ 07499 "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */ 07500 "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */ 07501 "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */ 07502 "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */ 07503 "movq %%mm4, (%%edi) \n\t" /* store result in Dest */ 07504 /* --- */ 07505 "sub %%eax, %%esi \n\t" /* move to the current top row in Src */ 07506 "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */ 07507 "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */ 07508 /* --- */ 07509 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 07510 "jnz .L10412 \n\t" /* check loop termination, proceed if required */ 07511 "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */ 07512 "mov %%edx, %%edi \n\t" /* restore most left current row Dest address */ 07513 "add %%eax, %%esi \n\t" /* move to the next row in Src */ 07514 "add %%eax, %%edi \n\t" /* move to the next row in Dest */ 07515 "decl %2 \n\t" /* decrease loop counter ROWS */ 07516 "jnz .L10410 \n\t" /* check loop termination, proceed if required */ 07517 /* --- */ 07518 "emms \n\t" /* exit MMX state */ 07519 "popa \n\t":"=m" (Dest) /* %0 */ 07520 :"m"(Src), /* %1 */ 07521 "m"(rows), /* %2 */ 07522 "m"(columns), /* %3 */ 07523 "m"(NRightShift) /* %4 */ 07524 ); 07525 #endif 07526 #endif 07527 return (0); 07528 } else { 07529 /* No non-MMX implementation yet */ 07530 return (-1); 07531 } 07532 } 07533 07537 /* Re-points ESP at a lower, 32-byte-aligned address and stores the previous ESP value at that address. EBX is used as scratch: EBX = ESP - 4, EBX is masked with -32, then [EBX] = ESP, then ESP = EBX. The body is only compiled when USE_MMX is defined; the __asm form is used when GCC__ is not defined, the asm volatile form otherwise. NOTE(review): the asm volatile statement ends in "::)", i.e. it declares no outputs, inputs or clobbers even though it overwrites EBX and ESP - confirm this is acceptable to the compiler and to callers. */ void SDL_imageFilterAlignStack(void) 07538 { 07539 #ifdef USE_MMX 07540 #if !defined(GCC__) 07541 __asm 07542 { /* --- stack alignment --- */ 07543 mov ebx, esp /* load ESP into
EBX */ 07544 sub ebx, 4 /* reserve space on stack for old value of ESP */ 07545 and ebx, -32 /* align EBX along a 32 byte boundary */ 07546 mov [ebx], esp /* save old value of ESP in stack, behind the boundary */ 07547 mov esp, ebx /* align ESP along a 32 byte boundary */ 07548 } 07549 #else 07550 asm volatile 07551 ( /* --- stack alignment --- */ 07552 "mov %%esp, %%ebx \n\t" /* load ESP into EBX */ 07553 "sub $4, %%ebx \n\t" /* reserve space on stack for old value of ESP */ 07554 "and $-32, %%ebx \n\t" /* align EBX along a 32 byte boundary */ 07555 "mov %%esp, (%%ebx) \n\t" /* save old value of ESP in stack, behind the boundary */ 07556 "mov %%ebx, %%esp \n\t" /* align ESP along a 32 byte boundary */ 07557 ::); 07558 #endif 07559 #endif 07560 } 07561 07565 /* Replaces ESP with the 32-bit value read from [ESP], using EBX as scratch. Presumably the counterpart of SDL_imageFilterAlignStack, which leaves the previous ESP stored at [ESP]; that pairing only works if ESP is unchanged in between - TODO confirm against callers. The body is only compiled when USE_MMX is defined. NOTE(review): the asm volatile form ends in "::)" and so declares no clobbers although EBX and ESP are written - confirm this is acceptable. */ void SDL_imageFilterRestoreStack(void) 07566 { 07567 #ifdef USE_MMX 07568 #if !defined(GCC__) 07569 __asm 07570 { /* --- restoring old stack --- */ 07571 mov ebx, [esp] /* load old value of ESP */ 07572 mov esp, ebx /* restore old value of ESP */ 07573 } 07574 #else 07575 asm volatile 07576 ( /* --- restoring old stack --- */ 07577 "mov (%%esp), %%ebx \n\t" /* load old value of ESP */ 07578 "mov %%ebx, %%esp \n\t" /* restore old value of ESP */ 07579 ::); 07580 #endif 07581 #endif 07582 }