SDL_gfx  2.0.24
C:/Users/Andreas Schiffler/Desktop/Sources/sdlgfx/SDL_imageFilter.c
Go to the documentation of this file.
00001 /*
00002 
00003 SDL_imageFilter.c: byte-image "filter" routines
00004 
00005 Copyright (C) 2001-2012  Andreas Schiffler
00006 
00007 This software is provided 'as-is', without any express or implied
00008 warranty. In no event will the authors be held liable for any damages
00009 arising from the use of this software.
00010 
00011 Permission is granted to anyone to use this software for any purpose,
00012 including commercial applications, and to alter it and redistribute it
00013 freely, subject to the following restrictions:
00014 
00015    1. The origin of this software must not be misrepresented; you must not
00016    claim that you wrote the original software. If you use this software
00017    in a product, an acknowledgment in the product documentation would be
00018    appreciated but is not required.
00019 
00020    2. Altered source versions must be plainly marked as such, and must not be
00021    misrepresented as being the original software.
00022 
00023    3. This notice may not be removed or altered from any source
00024    distribution.
00025 
00026 Andreas Schiffler -- aschiffler at ferzkopp dot net
00027 
00028 */
00029 
00030 /*
00031 
00032 Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
00033 
00034 Note: Most of the MMX code is based on published routines 
00035 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
00036 him for his work.
00037 
00038 */
00039 
00040 #include <stdio.h>
00041 #include <stdlib.h>
00042 #include <string.h>
00043 
00044 #include "SDL_imageFilter.h"
00045 
/*!
\brief Reverse the byte order of a 32-bit value (byte 0 <-> byte 3, byte 1 <-> byte 2).

The argument is converted to unsigned int before any shift, so a signed
operand cannot sign-extend on the right shift or overflow on the left shift,
and the result always has type unsigned int (assumed to be 32 bits wide).
The argument is expanded several times: do not pass an expression with side
effects.
*/
#define SWAP_32(x) ((((unsigned int)(x)) >> 24) | ((((unsigned int)(x)) & 0x00ff0000u) >> 8) | ((((unsigned int)(x)) & 0x0000ff00u) << 8) | (((unsigned int)(x)) << 24))

/* ------ Static variables ----- */

/*!
\brief Run-time switch for the MMX code paths.

Non-zero allows the MMX routines to be used, zero forces the C fallbacks.
MMX use is allowed by default.
*/
static int SDL_imageFilterUseMMX = 1;

/* Detect GCC: GCC__ selects the AT&T-syntax inline assembly variants below */
#if defined(__GNUC__)
#define GCC__
#endif
00062 
/*!
\brief Read the CPU feature flags (internal helper).

Executes CPUID with EAX=1 and returns the value the instruction leaves in EDX.
When the file is compiled without USE_MMX no assembly is emitted and 0 is
returned.

\return The EDX word of CPUID leaf 1, or 0 if USE_MMX is not defined.
*/
unsigned int _cpuFlags(void)
{
	unsigned int flags = 0;

#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, 1
		cpuid		/* get CPU ID flag */
		mov flags, edx	/* move result to mmx_bit */
		popa
	}
#else
	{
		unsigned int eax_io = 1;	/* in: requested leaf, out: overwritten by CPUID (unused) */
		unsigned int ecx_out = 0;	/* overwritten by CPUID (unused) */

		/*
		 * All operands live in registers, so nothing is addressed relative
		 * to the stack pointer inside the statement. CPUID overwrites EBX,
		 * which the compiler may have reserved for itself, so it is parked
		 * in ESI/RSI and restored instead of being listed as a clobber.
		 */
#if defined(__x86_64__)
		asm volatile ("movq   %%rbx, %%rsi \n\t"	/* save rbx */
			"cpuid                \n\t"	/* get CPU ID flag */
			"movq   %%rsi, %%rbx \n\t"	/* restore rbx */
			:"+a" (eax_io), "=c" (ecx_out), "=d" (flags)
			:
			:"rsi");
#else
		asm volatile ("movl   %%ebx, %%esi \n\t"	/* save ebx */
			"cpuid                \n\t"	/* get CPU ID flag */
			"movl   %%esi, %%ebx \n\t"	/* restore ebx */
			:"+a" (eax_io), "=c" (ecx_out), "=d" (flags)
			:
			:"esi");
#endif
		(void) eax_io;
		(void) ecx_out;
	}
#endif
#endif

	return (flags);
}
00094 
00100 int SDL_imageFilterMMXdetect(void)
00101 {
00102         unsigned int mmx_bit;
00103 
00104         /* Check override flag */
00105         if (SDL_imageFilterUseMMX == 0) {
00106                 return (0);
00107         }
00108 
00109         mmx_bit = _cpuFlags();
00110         mmx_bit &= 0x00800000;
00111         mmx_bit = (mmx_bit && 0x00800000);
00112 
00113         return (int)(mmx_bit);
00114 }
00115 
00119 void SDL_imageFilterMMXoff()
00120 {
00121         SDL_imageFilterUseMMX = 0;
00122 }
00123 
00127 void SDL_imageFilterMMXon()
00128 {
00129         SDL_imageFilterUseMMX = 1;
00130 }
00131 
00132 /* ------------------------------------------------------------------------------------ */
00133 
/*!
\brief Internal MMX implementation of byte-wise saturated addition: Dest = Src1 + Src2, clamped at 255.

Only the SrcLength / 8 complete groups of 8 bytes are processed; the
SrcLength % 8 trailing bytes are left for the caller.

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param SrcLength Number of bytes in the arrays.

\return 0 when compiled with USE_MMX (including the case SrcLength < 8, where
nothing is processed), -1 when compiled without USE_MMX.
*/
int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops decrement before testing, so a zero group count would wrap around */
	if (SrcLength < 8) {
		return (0);
	}
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1	/* load Src1 address into eax */
		mov ebx, Src2	/* load Src2 address into ebx */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1010:
		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
		movq [edi], mm1	/* store result in Dest */
		add eax, 8	/* increase Src1, Src2 and Dest  */
		add ebx, 8	/* register pointers by 8 */
		add edi, 8
		dec ecx	/* decrease loop counter */
		jnz L1010	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	{
		unsigned int count = SrcLength >> 3;	/* number of 8-byte groups, at least 1 here */

		/*
		 * Pointers and counter are read-write register operands, so the
		 * compiler knows they change and nothing is addressed through the
		 * stack pointer. The "memory" clobber covers the bytes read from
		 * Src1/Src2 and written to Dest.
		 */
		asm volatile
			(".p2align 4            \n\t"	/* 16 byte alignment of the loop entry */
			"1: movq (%0), %%mm1    \n\t"	/* load 8 bytes from Src1 into mm1 */
			"paddusb (%1), %%mm1    \n\t"	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
			"movq    %%mm1, (%2)    \n\t"	/* store result in Dest */
			"add          $8, %0    \n\t"	/* increase Src1, Src2 and Dest  */
			"add          $8, %1    \n\t"	/* register pointers by 8 */
			"add          $8, %2    \n\t"
			"dec              %3    \n\t"	/* decrease loop counter */
			"jnz              1b    \n\t"	/* check loop termination, proceed if required */
			"emms                   \n\t"	/* exit MMX state */
			:"+r" (Src1),	/* %0 */
			"+r" (Src2),	/* %1 */
			"+r" (Dest),	/* %2 */
			"+r" (count)	/* %3 */
			:
			:"memory", "cc");
	}
#endif
	return (0);
#else
	return (-1);
#endif
}
00196 
/*!
\brief Byte-wise saturated addition: Dest = min(Src1 + Src2, 255).

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param length Number of bytes in the arrays.

\return 0 on success (also for length 0), -1 if any pointer is NULL.
*/
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte still to be handled by the C loop */
	int sum;

	/* Reject missing buffers; an empty job is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Let the MMX routine take every complete group of 8 bytes */
	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
		SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
		pos = length & 0xfffffff8;
	}

	/* C loop: the whole array, or only the bytes behind the last full group */
	for (; pos < length; pos++) {
		sum = (int) Src1[pos] + (int) Src2[pos];
		Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
	}

	return 0;
}
00257 
/*!
\brief Internal MMX implementation of the byte-wise mean: Dest = Src1/2 + Src2/2.

Each source byte is halved by shifting 16-bit words right by one and then
AND-ing with Mask, which removes the bit that the word shift carries from the
upper byte into the lower byte. Only the SrcLength / 8 complete groups of
8 bytes are processed; trailing bytes are left for the caller.

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param SrcLength Number of bytes in the arrays.
\param Mask Pointer to the 8 mask bytes applied after the shift.

\return 0 when compiled with USE_MMX (including the case SrcLength < 8, where
nothing is processed), -1 when compiled without USE_MMX.
*/
int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
			   unsigned char *Mask)
{
#ifdef USE_MMX
	/* The loops decrement before testing, so a zero group count would wrap around */
	if (SrcLength < 8) {
		return (0);
	}
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Mask	/* load Mask address into edx */
		movq mm0, [edx]	/* load Mask into mm0 */
		mov eax, Src1	/* load Src1 address into eax */
		mov ebx, Src2	/* load Src2 address into ebx */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L21011:
		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
		movq mm2, [ebx]	/* load 8 bytes from Src2 into mm2 */
		/* --- Byte shift via Word shift --- */
		psrlw mm1, 1	/* shift 4 WORDS of mm1 1 bit to the right */
		psrlw mm2, 1	/* shift 4 WORDS of mm2 1 bit to the right */
		pand mm1, mm0	/* apply Mask to 8 BYTES of mm1 */
		pand mm2, mm0	/* apply Mask to 8 BYTES of mm2 */
		paddusb mm1, mm2	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
		movq [edi], mm1	/* store result in Dest */
		add eax, 8	/* increase Src1, Src2 and Dest  */
		add ebx, 8	/* register pointers by 8 */
		add edi, 8
		dec ecx	/* decrease loop counter */
		jnz L21011	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	{
		unsigned int count = SrcLength >> 3;	/* number of 8-byte groups, at least 1 here */

		/*
		 * Register operands only: nothing is addressed through the stack
		 * pointer, and the compiler is told that pointers and counter
		 * change. Mask is read before any operand is modified. The "memory"
		 * clobber covers the buffers read and written through the pointers.
		 */
		asm volatile
			("movq    (%4), %%mm0    \n\t"	/* load Mask into mm0 */
			".p2align 4             \n\t"	/* 16 byte alignment of the loop entry */
			"1:                     \n\t"
			"movq    (%0), %%mm1    \n\t"	/* load 8 bytes from Src1 into mm1 */
			"movq    (%1), %%mm2    \n\t"	/* load 8 bytes from Src2 into mm2 */
			/* --- Byte shift via Word shift --- */
			"psrlw        $1, %%mm1 \n\t"	/* shift 4 WORDS of mm1 1 bit to the right */
			"psrlw        $1, %%mm2 \n\t"	/* shift 4 WORDS of mm2 1 bit to the right */
			"pand      %%mm0, %%mm1 \n\t"	/* apply Mask to 8 BYTES of mm1 */
			"pand      %%mm0, %%mm2 \n\t"	/* apply Mask to 8 BYTES of mm2 */
			"paddusb   %%mm2, %%mm1 \n\t"	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
			"movq    %%mm1, (%2)    \n\t"	/* store result in Dest */
			"add          $8, %0    \n\t"	/* increase Src1, Src2 and Dest  */
			"add          $8, %1    \n\t"	/* register pointers by 8 */
			"add          $8, %2    \n\t"
			"dec              %3    \n\t"	/* decrease loop counter */
			"jnz              1b    \n\t"	/* check loop termination, proceed if required */
			"emms                   \n\t"	/* exit MMX state */
			:"+r" (Src1),	/* %0 */
			"+r" (Src2),	/* %1 */
			"+r" (Dest),	/* %2 */
			"+r" (count)	/* %3 */
			:"r" (Mask)	/* %4 */
			:"memory", "cc");
	}
#endif
	return (0);
#else
	return (-1);
#endif
}
00345 
/*!
\brief Byte-wise mean: Dest = Src1/2 + Src2/2 (each operand is halved before the addition).

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param length Number of bytes in the arrays.

\return 0 on success (also for length 0), -1 if any pointer is NULL.
*/
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	/* Per-byte mask handed to the MMX routine */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int pos = 0;	/* first byte still to be handled by the C loop */

	/* Reject missing buffers; an empty job is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Let the MMX routine take every complete group of 8 bytes */
	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
		SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
		pos = length & 0xfffffff8;
	}

	/* C loop: the whole array, or only the bytes behind the last full group */
	for (; pos < length; pos++) {
		Dest[pos] = (unsigned char) ((int) Src1[pos] / 2 + (int) Src2[pos] / 2);
	}

	return 0;
}
00404 
/*!
\brief Internal MMX implementation of byte-wise saturated subtraction: Dest = Src1 - Src2, clamped at 0.

Only the SrcLength / 8 complete groups of 8 bytes are processed; the
SrcLength % 8 trailing bytes are left for the caller.

\param Src1 Pointer to the first source byte array (minuend).
\param Src2 Pointer to the second source byte array (subtrahend).
\param Dest Pointer to the destination byte array.
\param SrcLength Number of bytes in the arrays.

\return 0 when compiled with USE_MMX (including the case SrcLength < 8, where
nothing is processed), -1 when compiled without USE_MMX.
*/
int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops decrement before testing, so a zero group count would wrap around */
	if (SrcLength < 8) {
		return (0);
	}
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1	/* load Src1 address into eax */
		mov ebx, Src2	/* load Src2 address into ebx */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1012:
		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
		psubusb mm1, [ebx]	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
		movq [edi], mm1	/* store result in Dest */
		add eax, 8	/* increase Src1, Src2 and Dest  */
		add ebx, 8	/* register pointers by 8 */
		add edi, 8
		dec ecx	/* decrease loop counter */
		jnz L1012	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	{
		unsigned int count = SrcLength >> 3;	/* number of 8-byte groups, at least 1 here */

		/*
		 * Pointers and counter are read-write register operands, so the
		 * compiler knows they change and nothing is addressed through the
		 * stack pointer. The "memory" clobber covers the bytes read from
		 * Src1/Src2 and written to Dest.
		 */
		asm volatile
			(".p2align 4            \n\t"	/* 16 byte alignment of the loop entry */
			"1: movq (%0), %%mm1    \n\t"	/* load 8 bytes from Src1 into mm1 */
			"psubusb (%1), %%mm1    \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
			"movq    %%mm1, (%2)    \n\t"	/* store result in Dest */
			"add          $8, %0    \n\t"	/* increase Src1, Src2 and Dest  */
			"add          $8, %1    \n\t"	/* register pointers by 8 */
			"add          $8, %2    \n\t"
			"dec              %3    \n\t"	/* decrease loop counter */
			"jnz              1b    \n\t"	/* check loop termination, proceed if required */
			"emms                   \n\t"	/* exit MMX state */
			:"+r" (Src1),	/* %0 */
			"+r" (Src2),	/* %1 */
			"+r" (Dest),	/* %2 */
			"+r" (count)	/* %3 */
			:
			:"memory", "cc");
	}
#endif
	return (0);
#else
	return (-1);
#endif
}
00467 
/*!
\brief Byte-wise saturated subtraction: Dest = max(Src1 - Src2, 0).

\param Src1 Pointer to the first source byte array (minuend).
\param Src2 Pointer to the second source byte array (subtrahend).
\param Dest Pointer to the destination byte array.
\param length Number of bytes in the arrays.

\return 0 on success (also for length 0), -1 if any pointer is NULL.
*/
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte still to be handled by the C loop */
	int diff;

	/* Reject missing buffers; an empty job is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Let the MMX routine take every complete group of 8 bytes */
	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
		SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
		pos = length & 0xfffffff8;
	}

	/* C loop: the whole array, or only the bytes behind the last full group */
	for (; pos < length; pos++) {
		diff = (int) Src1[pos] - (int) Src2[pos];
		Dest[pos] = (unsigned char) ((diff < 0) ? 0 : diff);
	}

	return 0;
}
00527 
/*!
\brief Internal MMX implementation of the byte-wise absolute difference: Dest = |Src1 - Src2|.

Both saturated differences (Src1-Src2 and Src2-Src1) are computed; one of
them is zero for every byte, so OR-ing them yields the absolute difference.
Only the SrcLength / 8 complete groups of 8 bytes are processed; trailing
bytes are left for the caller.

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param SrcLength Number of bytes in the arrays.

\return 0 when compiled with USE_MMX (including the case SrcLength < 8, where
nothing is processed), -1 when compiled without USE_MMX.
*/
int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops decrement before testing, so a zero group count would wrap around */
	if (SrcLength < 8) {
		return (0);
	}
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1	/* load Src1 address into eax */
		mov ebx, Src2	/* load Src2 address into ebx */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1013:
		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
		movq mm2, [ebx]	/* load 8 bytes from Src2 into mm2 */
		psubusb mm1, [ebx]	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
		psubusb mm2, [eax]	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
		por mm1, mm2	/* combine both mm2 and mm1 results */
		movq [edi], mm1	/* store result in Dest */
		add eax, 8	/* increase Src1, Src2 and Dest  */
		add ebx, 8	/* register pointers by 8 */
		add edi, 8
		dec ecx	/* decrease loop counter */
		jnz L1013	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	{
		unsigned int count = SrcLength >> 3;	/* number of 8-byte groups, at least 1 here */

		/*
		 * Pointers and counter are read-write register operands, so the
		 * compiler knows they change and nothing is addressed through the
		 * stack pointer. The "memory" clobber covers the bytes read from
		 * Src1/Src2 and written to Dest.
		 */
		asm volatile
			(".p2align 4            \n\t"	/* 16 byte alignment of the loop entry */
			"1: movq (%0), %%mm1    \n\t"	/* load 8 bytes from Src1 into mm1 */
			"movq    (%1), %%mm2    \n\t"	/* load 8 bytes from Src2 into mm2 */
			"psubusb (%1), %%mm1    \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
			"psubusb (%0), %%mm2    \n\t"	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
			"por       %%mm2, %%mm1 \n\t"	/* combine both mm2 and mm1 results */
			"movq    %%mm1, (%2)    \n\t"	/* store result in Dest */
			"add          $8, %0    \n\t"	/* increase Src1, Src2 and Dest  */
			"add          $8, %1    \n\t"	/* register pointers by 8 */
			"add          $8, %2    \n\t"
			"dec              %3    \n\t"	/* decrease loop counter */
			"jnz              1b    \n\t"	/* check loop termination, proceed if required */
			"emms                   \n\t"	/* exit MMX state */
			:"+r" (Src1),	/* %0 */
			"+r" (Src2),	/* %1 */
			"+r" (Dest),	/* %2 */
			"+r" (count)	/* %3 */
			:
			:"memory", "cc");
	}
#endif
	return (0);
#else
	return (-1);
#endif
}
00596 
/*!
\brief Byte-wise absolute difference: Dest = |Src1 - Src2|.

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param length Number of bytes in the arrays.

\return 0 on success (also for length 0), -1 if any pointer is NULL.
*/
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte still to be handled by the C loop */

	/* Reject missing buffers; an empty job is trivially done */
	if (Src1 == NULL || Src2 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Let the MMX routine take every complete group of 8 bytes */
	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
		SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
		pos = length & 0xfffffff8;
	}

	/* C loop: the whole array, or only the bytes behind the last full group */
	for (; pos < length; pos++) {
		Dest[pos] = (unsigned char) abs((int) Src1[pos] - (int) Src2[pos]);
	}

	return 0;
}
00654 
/*!
\brief Internal MMX implementation of the byte-wise multiplication Dest = Src1 * Src2.

Bytes are widened to 16-bit words, multiplied (low 16 bits kept), the
absolute value of each signed word result is taken, and the words are packed
back to bytes with unsigned saturation. Only the SrcLength / 8 complete
groups of 8 bytes are processed; trailing bytes are left for the caller.

\param Src1 Pointer to the first source byte array.
\param Src2 Pointer to the second source byte array.
\param Dest Pointer to the destination byte array.
\param SrcLength Number of bytes in the arrays.

\return 0 when compiled with USE_MMX (including the case SrcLength < 8, where
nothing is processed), -1 when compiled without USE_MMX.
*/
int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops decrement before testing, so a zero group count would wrap around */
	if (SrcLength < 8) {
		return (0);
	}
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1	/* load Src1 address into eax */
		mov ebx, Src2	/* load Src2 address into ebx */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		pxor mm0, mm0	/* zero mm0 register */
		align 16	/* 16 byte alignment of the loop entry */
L1014:
		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]	/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1	/* copy mm1 into mm2 */
		movq mm4, mm3	/* copy mm3 into mm4  */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		pmullw mm1, mm3	/* mul low  bytes of Src1 and Src2  */
		pmullw mm2, mm4	/* mul high bytes of Src1 and Src2 */
		/* Take abs value of the results (signed words) */
		movq mm5, mm1	/* copy mm1 into mm5 */
		movq mm6, mm2	/* copy mm2 into mm6 */
		psraw mm5, 15	/* fill mm5 words with word sign bit */
		psraw mm6, 15	/* fill mm6 words with word sign bit */
		pxor mm1, mm5	/* take 1's complement of only neg. words */
		pxor mm2, mm6	/* take 1's complement of only neg. words */
		psubsw mm1, mm5	/* add 1 to only neg. words, W-(-1) or W-0 */
		psubsw mm2, mm6	/* add 1 to only neg. words, W-(-1) or W-0 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1	/* store result in Dest */
		add eax, 8	/* increase Src1, Src2 and Dest  */
		add ebx, 8	/* register pointers by 8 */
		add edi, 8
		dec ecx	/* decrease loop counter */
		jnz L1014	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	{
		unsigned int count = SrcLength >> 3;	/* number of 8-byte groups, at least 1 here */

		/*
		 * Pointers and counter are read-write register operands, so the
		 * compiler knows they change and nothing is addressed through the
		 * stack pointer. The "memory" clobber covers the bytes read from
		 * Src1/Src2 and written to Dest.
		 */
		asm volatile
			("pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
			".p2align 4             \n\t"	/* 16 byte alignment of the loop entry */
			"1: movq (%0), %%mm1    \n\t"	/* load 8 bytes from Src1 into mm1 */
			"movq    (%1), %%mm3    \n\t"	/* load 8 bytes from Src2 into mm3 */
			"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
			"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
			"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
			/* Take abs value of the results (signed words) */
			"movq      %%mm1, %%mm5 \n\t"	/* copy mm1 into mm5 */
			"movq      %%mm2, %%mm6 \n\t"	/* copy mm2 into mm6 */
			"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
			"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
			"pxor      %%mm5, %%mm1 \n\t"	/* take 1's complement of only neg. words */
			"pxor      %%mm6, %%mm2 \n\t"	/* take 1's complement of only neg. words */
			"psubsw    %%mm5, %%mm1 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"psubsw    %%mm6, %%mm2 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
			"movq    %%mm1, (%2)    \n\t"	/* store result in Dest */
			"add          $8, %0    \n\t"	/* increase Src1, Src2 and Dest  */
			"add          $8, %1    \n\t"	/* register pointers by 8 */
			"add          $8, %2    \n\t"
			"dec              %3    \n\t"	/* decrease loop counter */
			"jnz              1b    \n\t"	/* check loop termination, proceed if required */
			"emms                   \n\t"	/* exit MMX state */
			:"+r" (Src1),	/* %0 */
			"+r" (Src2),	/* %1 */
			"+r" (Dest),	/* %2 */
			"+r" (count)	/* %3 */
			:
			:"memory", "cc");
	}
#endif
	return (0);
#else
	return (-1);
#endif
}
00755 
/**
 * Filter using Mult: D = saturation255(S1 * S2).
 *
 * Each destination byte receives the product of the two corresponding source
 * bytes, clamped to 255. If MMX is reported as available and at least 8
 * bytes are supplied, SDL_imageFilterMultMMX() is called for the buffer and
 * only the trailing (length & 7) bytes are computed by the C loop below;
 * otherwise the C loop handles every byte.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int pos = 0;   /* first index still to be handled in C */
        int product;

        /* Reject missing buffers; an empty request is trivially done */
        if (!Src1 || !Src2 || !Dest) {
                return (-1);
        }
        if (!length) {
                return (0);
        }

        if (SDL_imageFilterMMXdetect() && (length >= 8)) {
                /* Let the MMX kernel handle the whole 8-byte groups */
                SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
                /* Continue in C right after the last whole 8-byte group */
                pos = length & 0xfffffff8u;
        }

        /* Scalar path: whole buffer, or just the leftover tail */
        for (; pos < length; pos++) {
                product = (int) Src1[pos] * (int) Src2[pos];
                Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
        }

        return (0);
}
00818 
/*
 * SDL_imageFilterMultNorASM: byte-wise multiply without saturation,
 * D = (S1 * S2) & 0xff, implemented in x86 inline assembly.
 *
 * The loop handles one byte per iteration: an 8-bit MUL leaves the 16-bit
 * product in AX and only AL (the low byte) is written to Dest. Although the
 * code is guarded by USE_MMX, no MMX instructions are used here.
 *
 * Src1, Src2  - source byte arrays
 * Dest        - destination byte array
 * SrcLength   - number of bytes to process; must be > 0, because the counter
 *               is decremented and tested only after the loop body has run
 *               (a value of 0 would wrap around instead of terminating)
 *
 * Returns 0 when built with USE_MMX, -1 (nothing processed) otherwise.
 *
 * NOTE(review): the GCC variant lists Dest as an "=m" output although the
 * asm only reads the pointer, and it declares no clobbers; it appears to
 * rely on pusha/popa to leave the register state intact - confirm this is
 * safe with the compilers and optimization levels in use.
 */
00829 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
00830 {
00831 #ifdef USE_MMX
00832 #if !defined(GCC__)
/* Non-GCC compilers: MSVC-style inline assembly (Intel syntax) */
00833         __asm
00834         {
00835                 pusha
00836                         mov edx, Src1   /* load Src1 address into edx */
00837                         mov esi, Src2   /* load Src2 address into esi */
00838                         mov edi, Dest   /* load Dest address into edi */
00839                         mov ecx, SrcLength   /* load loop counter (number of bytes) into ecx */
00840                         align 16        /* 16 byte alignment of the loop entry */
00841 L10141:
00842                 mov al, [edx]   /* load a byte from Src1 */
00843                 mul [esi]       /* multiply with a byte from Src2, product in ax */
00844                 mov [edi], al   /* store only the low byte of the product in Dest */
00845                         inc edx         /* increment Src1, Src2, Dest */
00846                         inc esi                 /* pointer registers by one */
00847                         inc edi
00848                         dec ecx /* decrease loop counter */
00849                         jnz L10141      /* check loop termination, proceed if required */
00850                         popa
00851         }
00852 #else
/* GCC: same loop in AT&T syntax; %0 = Dest, %1 = Src2, %2 = Src1, %3 = SrcLength */
00853         asm volatile
00854                 ("pusha              \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */
00855                 "mov %1, %%esi \n\t"    /* load Src2 address into esi */
00856                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
00857                 "mov %3, %%ecx \n\t"    /* load loop counter (number of bytes) into ecx */
00858                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
00859                 "1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
00860                 "mulb (%%esi)       \n\t"       /* multiply with a byte from Src2, product in ax */
00861                 "mov %%al, (%%edi)  \n\t"       /* store only the low byte of the product in Dest */
00862                 "inc %%edx \n\t"                /* increment Src1, Src2, Dest */
00863                 "inc %%esi \n\t"                /* pointer registers by one */
00864                 "inc %%edi \n\t" "dec %%ecx      \n\t"  /* decrease loop counter */
00865                 "jnz 1b         \n\t"     /* check loop termination, proceed if required */
00866                 "popa                   \n\t":"=m" (Dest)       /* %0 */
00867                 :"m"(Src2),             /* %1 */
00868                 "m"(Src1),              /* %2 */
00869                 "m"(SrcLength)          /* %3 */
00870                 );
00871 #endif
00872         return (0);
00873 #else
/* Built without USE_MMX: no assembly available, report failure */
00874         return (-1);
00875 #endif
00876 }
00877 
/**
 * Filter using MultNor: D = S1 * S2 (non-saturating, low 8 bits of the product).
 *
 * When MMX support is detected, the work is delegated to
 * SDL_imageFilterMultNorASM(). That routine walks the buffers one byte at a
 * time and therefore already covers all `length` bytes, so no tail
 * processing is required afterwards. (Re-running the C loop on the last
 * (length & 7) bytes, as was done previously, multiplied those bytes a second
 * time whenever Dest aliased Src1 or Src2.)
 *
 * If the assembly routine reports that it is not available (returns non-zero),
 * or MMX is not detected, the whole buffer is processed by the C loop.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int i;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) {
                return (-1);
        }
        if (length == 0) {
                return (0);
        }

        if (SDL_imageFilterMMXdetect()) {
                /*
                 * The ASM routine handles every byte (length is > 0 here, which
                 * its do-while style loop requires). A return of 0 means the
                 * buffer has been fully processed.
                 */
                if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0) {
                        return (0);
                }
                /* ASM path not compiled in: fall through to the C routine */
        }

        /* C routine to process the whole image */
        for (i = 0; i < length; i++) {
                result = (int) Src1[i] * (int) Src2[i];
                /* Conversion to unsigned char keeps the low 8 bits only */
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
00940 
/*
 * SDL_imageFilterMultDivby2MMX: MMX kernel for D = saturation255((S1 / 2) * S2).
 *
 * Each iteration loads 8 bytes from Src1 and from Src2, widens them to
 * 16-bit words (unpack against a zeroed register), shifts the Src1 words
 * right by one, multiplies word-wise (PMULLW keeps the low 16 bits of each
 * product) and packs the words back to bytes with unsigned saturation
 * (PACKUSWB) before storing 8 bytes to Dest.
 *
 * Src1, Src2  - source byte arrays
 * Dest        - destination byte array
 * SrcLength   - number of bytes; it is divided by 8 to obtain the iteration
 *               count, so the trailing (SrcLength & 7) bytes are NOT
 *               processed here. The counter is tested only after the loop
 *               body has run, so SrcLength must be at least 8.
 *
 * Returns 0 when built with USE_MMX, -1 (nothing processed) otherwise.
 *
 * NOTE(review): the GCC variant lists Dest as an "=m" output although the
 * asm only reads the pointer, and it declares no clobbers (general-purpose
 * or MMX registers, memory); confirm this is safe with the compilers and
 * optimization levels in use.
 */
00951 int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
00952 {
00953 #ifdef USE_MMX
00954 #if !defined(GCC__)
/* Non-GCC compilers: MSVC-style inline assembly (Intel syntax) */
00955         __asm
00956         { 
00957                 pusha
00958                         mov eax, Src1           /* load Src1 address into eax */
00959                         mov ebx, Src2           /* load Src2 address into ebx */
00960                         mov edi, Dest           /* load Dest address into edi */
00961                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
00962                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
00963                         pxor mm0,  mm0  /* zero mm0 register (used to zero-extend bytes to words) */
00964                         align 16                /* 16 byte alignment of the loop entry */
00965 L1015:
00966                 movq mm1,  [eax]        /* load 8 bytes from Src1 into mm1 */
00967                 movq mm3,  [ebx]        /* load 8 bytes from Src2 into mm3 */
00968                 movq mm2,  mm1  /* copy mm1 into mm2 */
00969                         movq mm4,  mm3  /* copy mm3 into mm4  */
00970                         punpcklbw mm1,  mm0     /* unpack low  bytes of Src1 into words */
00971                         punpckhbw mm2,  mm0     /* unpack high bytes of Src1 into words */
00972                         punpcklbw mm3,  mm0     /* unpack low  bytes of Src2 into words */
00973                         punpckhbw mm4,  mm0     /* unpack high bytes of Src2 into words */
00974                         psrlw mm1,  1   /* divide mm1 words by 2, Src1 low bytes */
00975                         psrlw mm2,  1   /* divide mm2 words by 2, Src1 high bytes */
00976                         pmullw mm1,  mm3        /* mul low  bytes of Src1 and Src2  */
00977                         pmullw mm2,  mm4        /* mul high bytes of Src1 and Src2 */
00978                         packuswb mm1,  mm2      /* pack words back into bytes with saturation */
00979                         movq [edi],  mm1        /* store result in Dest */
00980                         add eax,  8     /* increase Src1, Src2 and Dest  */
00981                         add ebx,  8     /* register pointers by 8 */
00982                         add edi,  8
00983                         dec ecx         /* decrease loop counter */
00984                         jnz L1015               /* check loop termination, proceed if required */
00985                         emms                    /* exit MMX state */
00986                         popa
00987         }
00988 #else
/* GCC: same loop in AT&T syntax; %0 = Dest, %1 = Src2, %2 = Src1, %3 = SrcLength */
00989         asm volatile
00990                 ("pusha \n\t" "mov %2, %%eax \n\t"      /* load Src1 address into eax */
00991                 "mov %1, %%ebx \n\t"    /* load Src2 address into ebx */
00992                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
00993                 "mov %3, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
00994                 "shr $3, %%ecx \n\t"    /* counter/8 (MMX loads 8 bytes at a time) */
00995                 "pxor      %%mm0, %%mm0 \n\t"   /* zero mm0 register (used to zero-extend bytes to words) */
00996                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
00997                 "1: movq (%%eax), %%mm1 \n\t"   /* load 8 bytes from Src1 into mm1 */
00998                 "movq    (%%ebx), %%mm3 \n\t"   /* load 8 bytes from Src2 into mm3 */
00999                 "movq      %%mm1, %%mm2 \n\t"   /* copy mm1 into mm2 */
01000                 "movq      %%mm3, %%mm4 \n\t"   /* copy mm3 into mm4  */
01001                 "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack low  bytes of Src1 into words */
01002                 "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack high bytes of Src1 into words */
01003                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of Src2 into words */
01004                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of Src2 into words */
01005                 "psrlw        $1, %%mm1 \n\t"   /* divide mm1 words by 2, Src1 low bytes */
01006                 "psrlw        $1, %%mm2 \n\t"   /* divide mm2 words by 2, Src1 high bytes */
01007                 "pmullw    %%mm3, %%mm1 \n\t"   /* mul low  bytes of Src1 and Src2  */
01008                 "pmullw    %%mm4, %%mm2 \n\t"   /* mul high bytes of Src1 and Src2 */
01009                 "packuswb  %%mm2, %%mm1 \n\t"   /* pack words back into bytes with saturation */
01010                 "movq    %%mm1, (%%edi) \n\t"   /* store result in Dest */
01011                 "add $8, %%eax \n\t"    /* increase Src1, Src2 and Dest  */
01012                 "add $8, %%ebx \n\t"    /* register pointers by 8 */
01013                 "add $8, %%edi \n\t" "dec %%ecx     \n\t"       /* decrease loop counter */
01014                 "jnz 1b        \n\t"    /* check loop termination, proceed if required */
01015                 "emms          \n\t"    /* exit MMX state */
01016                 "popa \n\t":"=m" (Dest) /* %0 */
01017                 :"m"(Src2),             /* %1 */
01018                 "m"(Src1),              /* %2 */
01019                 "m"(SrcLength)          /* %3 */
01020                 );
01021 #endif
01022         return (0);
01023 #else
/* Built without USE_MMX: no assembly available, report failure */
01024         return (-1);
01025 #endif
01026 }
01027 
/**
 * Filter using MultDivby2: D = saturation255((S1 / 2) * S2).
 *
 * Each byte of Src1 is halved (integer division), multiplied by the
 * matching byte of Src2 and the product is clamped to 255. If MMX is
 * reported as available and at least 8 bytes are supplied,
 * SDL_imageFilterMultDivby2MMX() is called for the buffer and only the
 * trailing (length & 7) bytes are computed by the C loop; otherwise the
 * C loop handles every byte.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int pos = 0;   /* first index still to be handled in C */
        int product;

        /* Reject missing buffers; an empty request is trivially done */
        if (!Src1 || !Src2 || !Dest) {
                return (-1);
        }
        if (!length) {
                return (0);
        }

        if (SDL_imageFilterMMXdetect() && (length >= 8)) {
                /* Let the MMX kernel handle the whole 8-byte groups */
                SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
                /* Continue in C right after the last whole 8-byte group */
                pos = length & 0xfffffff8u;
        }

        /* Scalar path: whole buffer, or just the leftover tail */
        for (; pos < length; pos++) {
                product = ((int) Src1[pos] / 2) * (int) Src2[pos];
                Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
        }

        return (0);
}
01087 
/*
 * SDL_imageFilterMultDivby4MMX: MMX kernel for
 * D = saturation255((S1 / 2) * (S2 / 2)).
 *
 * Each iteration loads 8 bytes from Src1 and from Src2, widens them to
 * 16-bit words (unpack against a zeroed register), shifts both the Src1 and
 * the Src2 words right by one, multiplies word-wise (PMULLW keeps the low
 * 16 bits of each product) and packs the words back to bytes with unsigned
 * saturation (PACKUSWB) before storing 8 bytes to Dest.
 *
 * Src1, Src2  - source byte arrays
 * Dest        - destination byte array
 * SrcLength   - number of bytes; it is divided by 8 to obtain the iteration
 *               count, so the trailing (SrcLength & 7) bytes are NOT
 *               processed here. The counter is tested only after the loop
 *               body has run, so SrcLength must be at least 8.
 *
 * Returns 0 when built with USE_MMX, -1 (nothing processed) otherwise.
 *
 * NOTE(review): the GCC variant lists Dest as an "=m" output although the
 * asm only reads the pointer, and it declares no clobbers (general-purpose
 * or MMX registers, memory); confirm this is safe with the compilers and
 * optimization levels in use.
 */
01098 int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
01099 {
01100 #ifdef USE_MMX
01101 #if !defined(GCC__)
/* Non-GCC compilers: MSVC-style inline assembly (Intel syntax) */
01102         __asm
01103         {
01104                 pusha
01105                         mov eax, Src1           /* load Src1 address into eax */
01106                         mov ebx, Src2           /* load Src2 address into ebx */
01107                         mov edi, Dest           /* load Dest address into edi */
01108                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01109                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
01110                         pxor mm0, mm0           /* zero mm0 register (used to zero-extend bytes to words) */
01111                         align 16                /* 16 byte alignment of the loop entry */
01112 L1016:
01113                 movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
01114                 movq mm3, [ebx]         /* load 8 bytes from Src2 into mm3 */
01115                 movq mm2, mm1           /* copy mm1 into mm2 */
01116                         movq mm4, mm3           /* copy mm3 into mm4  */
01117                         punpcklbw mm1, mm0      /* unpack low  bytes of Src1 into words */
01118                         punpckhbw mm2, mm0      /* unpack high bytes of Src1 into words */
01119                         punpcklbw mm3, mm0      /* unpack low  bytes of Src2 into words */
01120                         punpckhbw mm4, mm0      /* unpack high bytes of Src2 into words */
01121                         psrlw mm1, 1    /* divide mm1 words by 2, Src1 low bytes */
01122                         psrlw mm2, 1    /* divide mm2 words by 2, Src1 high bytes */
01123                         psrlw mm3, 1    /* divide mm3 words by 2, Src2 low bytes */
01124                         psrlw mm4, 1    /* divide mm4 words by 2, Src2 high bytes */
01125                         pmullw mm1, mm3         /* mul low  bytes of Src1 and Src2  */
01126                         pmullw mm2, mm4         /* mul high bytes of Src1 and Src2 */
01127                         packuswb mm1, mm2       /* pack words back into bytes with saturation */
01128                         movq [edi], mm1         /* store result in Dest */
01129                         add eax, 8      /* increase Src1, Src2 and Dest  */
01130                         add ebx, 8      /* register pointers by 8 */
01131                         add edi,  8
01132                         dec ecx         /* decrease loop counter */
01133                         jnz L1016               /* check loop termination, proceed if required */
01134                         emms                    /* exit MMX state */
01135                         popa
01136         }
01137 #else
/* GCC: same loop in AT&T syntax; %0 = Dest, %1 = Src2, %2 = Src1, %3 = SrcLength */
01138         asm volatile
01139                 ("pusha              \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
01140                 "mov %1, %%ebx \n\t"    /* load Src2 address into ebx */
01141                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
01142                 "mov %3, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
01143                 "shr $3, %%ecx \n\t"    /* counter/8 (MMX loads 8 bytes at a time) */
01144                 "pxor      %%mm0, %%mm0 \n\t"   /* zero mm0 register (used to zero-extend bytes to words) */
01145                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
01146                 "1: movq (%%eax), %%mm1 \n\t"   /* load 8 bytes from Src1 into mm1 */
01147                 "movq    (%%ebx), %%mm3 \n\t"   /* load 8 bytes from Src2 into mm3 */
01148                 "movq      %%mm1, %%mm2 \n\t"   /* copy mm1 into mm2 */
01149                 "movq      %%mm3, %%mm4 \n\t"   /* copy mm3 into mm4  */
01150                 "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack low  bytes of Src1 into words */
01151                 "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack high bytes of Src1 into words */
01152                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of Src2 into words */
01153                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of Src2 into words */
01154                 "psrlw        $1, %%mm1 \n\t"   /* divide mm1 words by 2, Src1 low bytes */
01155                 "psrlw        $1, %%mm2 \n\t"   /* divide mm2 words by 2, Src1 high bytes */
01156                 "psrlw        $1, %%mm3 \n\t"   /* divide mm3 words by 2, Src2 low bytes */
01157                 "psrlw        $1, %%mm4 \n\t"   /* divide mm4 words by 2, Src2 high bytes */
01158                 "pmullw    %%mm3, %%mm1 \n\t"   /* mul low  bytes of Src1 and Src2  */
01159                 "pmullw    %%mm4, %%mm2 \n\t"   /* mul high bytes of Src1 and Src2 */
01160                 "packuswb  %%mm2, %%mm1 \n\t"   /* pack words back into bytes with saturation */
01161                 "movq    %%mm1, (%%edi) \n\t"   /* store result in Dest */
01162                 "add $8, %%eax \n\t"    /* increase Src1, Src2 and Dest  */
01163                 "add $8, %%ebx \n\t"    /* register pointers by 8 */
01164                 "add $8, %%edi \n\t" "dec %%ecx     \n\t"       /* decrease loop counter */
01165                 "jnz 1b        \n\t"    /* check loop termination, proceed if required */
01166                 "emms          \n\t"    /* exit MMX state */
01167                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01168                 :"m"(Src2),             /* %1 */
01169                 "m"(Src1),              /* %2 */
01170                 "m"(SrcLength)          /* %3 */
01171                 );
01172 #endif
01173         return (0);
01174 #else
/* Built without USE_MMX: no assembly available, report failure */
01175         return (-1);
01176 #endif
01177 }
01178 
/**
 * Filter using MultDivby4: D = saturation255((S1 / 2) * (S2 / 2)).
 *
 * Both source bytes are halved (integer division) before being multiplied,
 * and the product is clamped to 255. If MMX is reported as available and at
 * least 8 bytes are supplied, SDL_imageFilterMultDivby4MMX() is called for
 * the buffer and only the trailing (length & 7) bytes are computed by the
 * C loop; otherwise the C loop handles every byte.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int pos = 0;   /* first index still to be handled in C */
        int product;

        /* Reject missing buffers; an empty request is trivially done */
        if (!Src1 || !Src2 || !Dest) {
                return (-1);
        }
        if (!length) {
                return (0);
        }

        if (SDL_imageFilterMMXdetect() && (length >= 8)) {
                /* Let the MMX kernel handle the whole 8-byte groups */
                SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
                /* Continue in C right after the last whole 8-byte group */
                pos = length & 0xfffffff8u;
        }

        /* Scalar path: whole buffer, or just the leftover tail */
        for (; pos < length; pos++) {
                product = ((int) Src1[pos] / 2) * ((int) Src2[pos] / 2);
                Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
        }

        return (0);
}
01238 
/*
 * SDL_imageFilterBitAndMMX: MMX kernel for D = S1 & S2.
 *
 * Each iteration loads 8 bytes from Src1, ANDs them with 8 bytes read from
 * Src2 (PAND with a memory operand) and stores the 8 result bytes to Dest.
 *
 * Src1, Src2  - source byte arrays
 * Dest        - destination byte array
 * SrcLength   - number of bytes; it is divided by 8 to obtain the iteration
 *               count, so the trailing (SrcLength & 7) bytes are NOT
 *               processed here. The counter is tested only after the loop
 *               body has run, so SrcLength must be at least 8.
 *
 * Returns 0 when built with USE_MMX, -1 (nothing processed) otherwise.
 *
 * NOTE(review): the GCC variant lists Dest as an "=m" output although the
 * asm only reads the pointer, and it declares no clobbers (general-purpose
 * or MMX registers, memory); confirm this is safe with the compilers and
 * optimization levels in use.
 */
01249 int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
01250 {
01251 #ifdef USE_MMX
01252 #if !defined(GCC__)
/* Non-GCC compilers: MSVC-style inline assembly (Intel syntax) */
01253         __asm
01254         {
01255                 pusha
01256                         mov eax, Src1           /* load Src1 address into eax */
01257                         mov ebx, Src2           /* load Src2 address into ebx */
01258                         mov edi, Dest           /* load Dest address into edi */
01259                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01260                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
01261                         align 16                /* 16 byte alignment of the loop entry */
01262 L1017:
01263                 movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
01264                 pand mm1, [ebx]         /* mm1=Src1&Src2 */
01265                 movq [edi], mm1         /* store result in Dest */
01266                         add eax, 8      /* increase Src1, Src2 and Dest  */
01267                         add ebx, 8      /* register pointers by 8 */
01268                         add edi, 8
01269                         dec ecx         /* decrease loop counter */
01270                         jnz L1017               /* check loop termination, proceed if required */
01271                         emms                    /* exit MMX state */
01272                         popa
01273         }
01274 #else
/* GCC: same loop in AT&T syntax; %0 = Dest, %1 = Src2, %2 = Src1, %3 = SrcLength */
01275         asm volatile
01276                 ("pusha              \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
01277                 "mov %1, %%ebx \n\t"    /* load Src2 address into ebx */
01278                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
01279                 "mov %3, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
01280                 "shr $3, %%ecx \n\t"    /* counter/8 (MMX loads 8 bytes at a time) */
01281                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
01282                 "1: movq (%%eax), %%mm1 \n\t"   /* load 8 bytes from Src1 into mm1 */
01283                 "pand    (%%ebx), %%mm1 \n\t"   /* mm1=Src1&Src2 */
01284                 "movq    %%mm1, (%%edi) \n\t"   /* store result in Dest */
01285                 "add $8, %%eax \n\t"    /* increase Src1, Src2 and Dest  */
01286                 "add $8, %%ebx \n\t"    /* register pointers by 8 */
01287                 "add $8, %%edi \n\t" "dec %%ecx     \n\t"       /* decrease loop counter */
01288                 "jnz 1b        \n\t"    /* check loop termination, proceed if required */
01289                 "emms          \n\t"    /* exit MMX state */
01290                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01291                 :"m"(Src2),             /* %1 */
01292                 "m"(Src1),              /* %2 */
01293                 "m"(SrcLength)          /* %3 */
01294                 );
01295 #endif
01296         return (0);
01297 #else
/* Built without USE_MMX: no assembly available, report failure */
01298         return (-1);
01299 #endif
01300 }
01301 
/**
 * Filter using BitAnd: D = S1 & S2.
 *
 * Each destination byte receives the bitwise AND of the two corresponding
 * source bytes. If SDL_imageFilterMMXdetect() returns a positive value and
 * at least 8 bytes are supplied, SDL_imageFilterBitAndMMX() is called for
 * the buffer and only the trailing (length & 7) bytes are computed by the
 * C loop; otherwise the C loop handles every byte.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int pos = 0;   /* first index still to be handled in C */

        /* Reject missing buffers; an empty request is trivially done */
        if (!Src1 || !Src2 || !Dest) {
                return (-1);
        }
        if (!length) {
                return (0);
        }

        if ((SDL_imageFilterMMXdetect() > 0) && (length >= 8)) {
                /* Let the MMX kernel handle the whole 8-byte groups */
                SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
                /* Continue in C right after the last whole 8-byte group */
                pos = length & 0xfffffff8u;
        }

        /* Scalar path: whole buffer, or just the leftover tail */
        for (; pos < length; pos++) {
                Dest[pos] = Src1[pos] & Src2[pos];
        }

        return (0);
}
01360 
/*
 * SDL_imageFilterBitOrMMX: MMX kernel for D = S1 | S2.
 *
 * Each iteration loads 8 bytes from Src1, ORs them with 8 bytes read from
 * Src2 (POR with a memory operand) and stores the 8 result bytes to Dest.
 *
 * Src1, Src2  - source byte arrays
 * Dest        - destination byte array
 * SrcLength   - number of bytes; it is divided by 8 to obtain the iteration
 *               count, so the trailing (SrcLength & 7) bytes are NOT
 *               processed here. The counter is tested only after the loop
 *               body has run, so SrcLength must be at least 8.
 *
 * Returns 0 when built with USE_MMX, -1 (nothing processed) otherwise.
 *
 * NOTE(review): the GCC variant lists Dest as an "=m" output although the
 * asm only reads the pointer, and it declares no clobbers (general-purpose
 * or MMX registers, memory); confirm this is safe with the compilers and
 * optimization levels in use.
 */
01371 int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
01372 {
01373 #ifdef USE_MMX
01374 #if !defined(GCC__)
/* Non-GCC compilers: MSVC-style inline assembly (Intel syntax) */
01375         __asm
01376         {
01377                 pusha
01378                         mov eax, Src1           /* load Src1 address into eax */
01379                         mov ebx, Src2           /* load Src2 address into ebx */
01380                         mov edi, Dest           /* load Dest address into edi */
01381                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01382                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
01383                         align 16                /* 16 byte alignment of the loop entry */
01384 L91017:
01385                 movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
01386                 por mm1, [ebx]          /* mm1=Src1|Src2 */
01387                 movq [edi], mm1         /* store result in Dest */
01388                         add eax, 8      /* increase Src1, Src2 and Dest  */
01389                         add ebx, 8      /* register pointers by 8 */
01390                         add edi,  8
01391                         dec ecx         /* decrease loop counter */
01392                         jnz L91017              /* check loop termination, proceed if required */
01393                         emms                    /* exit MMX state */
01394                         popa
01395         }
01396 #else
/* GCC: same loop in AT&T syntax; %0 = Dest, %1 = Src2, %2 = Src1, %3 = SrcLength */
01397         asm volatile
01398                 ("pusha              \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
01399                 "mov %1, %%ebx \n\t"    /* load Src2 address into ebx */
01400                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
01401                 "mov %3, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
01402                 "shr $3, %%ecx \n\t"    /* counter/8 (MMX loads 8 bytes at a time) */
01403                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
01404                 "1: movq (%%eax), %%mm1 \n\t"   /* load 8 bytes from Src1 into mm1 */
01405                 "por     (%%ebx), %%mm1 \n\t"   /* mm1=Src1|Src2 */
01406                 "movq    %%mm1, (%%edi) \n\t"   /* store result in Dest */
01407                 "add $8, %%eax \n\t"    /* increase Src1, Src2 and Dest  */
01408                 "add $8, %%ebx \n\t"    /* register pointers by 8 */
01409                 "add $8, %%edi \n\t" "dec %%ecx     \n\t"       /* decrease loop counter */
01410                 "jnz 1b        \n\t"    /* check loop termination, proceed if required */
01411                 "emms          \n\t"    /* exit MMX state */
01412                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01413                 :"m"(Src2),             /* %1 */
01414                 "m"(Src1),              /* %2 */
01415                 "m"(SrcLength)          /* %3 */
01416                 );
01417 #endif
01418         return (0);
01419 #else
/* Built without USE_MMX: no assembly available, report failure */
01420         return (-1);
01421 #endif
01422 }
01423 
/**
 * Filter using BitOr: D = S1 | S2.
 *
 * Each destination byte receives the bitwise OR of the two corresponding
 * source bytes. If MMX is reported as available and at least 8 bytes are
 * supplied, SDL_imageFilterBitOrMMX() is called for the buffer and only the
 * trailing (length & 7) bytes are computed by the C loop; otherwise the
 * C loop handles every byte.
 *
 * @param Src1   Pointer to the start of the first source byte array (S1).
 * @param Src2   Pointer to the start of the second source byte array (S2).
 * @param Dest   Pointer to the start of the destination byte array (D).
 * @param length Number of bytes in each array.
 *
 * @return 0 on success (also when length is 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
        unsigned int pos = 0;   /* first index still to be handled in C */

        /* Reject missing buffers; an empty request is trivially done */
        if (!Src1 || !Src2 || !Dest) {
                return (-1);
        }
        if (!length) {
                return (0);
        }

        if (SDL_imageFilterMMXdetect() && (length >= 8)) {
                /* Let the MMX kernel handle the whole 8-byte groups */
                SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
                /* Continue in C right after the last whole 8-byte group */
                pos = length & 0xfffffff8u;
        }

        /* Scalar path: whole buffer, or just the leftover tail */
        for (; pos < length; pos++) {
                Dest[pos] = Src1[pos] | Src2[pos];
        }

        return (0);
}
01479 
/*!
\brief Internal x86 assembly routine for filter Div: D = S1 / S2, processed one byte at a time.

Where a byte of Src2 is zero, the corresponding Dest byte is set to 255. Although the body is
guarded by USE_MMX, it only uses general-purpose registers (no MMX registers, no emms).
The loop counter is tested only after a byte has been processed, so SrcLength must be non-zero.

NOTE(review): the GCC variant declares Dest as an "=m" output although the asm only reads the
pointer value, and it lists no clobbers while storing through that pointer - confirm against the
GCC extended-asm rules.

\param Src1 Pointer to the start of the first source byte array (S1, the dividends).
\param Src2 Pointer to the start of the second source byte array (S2, the divisors).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes to process.

\return Returns 0 when the routine is compiled in (USE_MMX defined), or -1 when it is not.
*/
01490 int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
01491 {
01492 #ifdef USE_MMX
01493 #if !defined(GCC__)
01494         __asm
01495         {
01496                 pusha
01497                         mov edx, Src1           /* load Src1 address into edx */
01498                         mov esi, Src2           /* load Src2 address into esi */
01499                         mov edi, Dest           /* load Dest address into edi */
01500                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01501                         align 16                /* 16 byte alignment of the loop entry */
01502 L10191:
01503                 mov bl, [esi]           /* load a byte (the divisor) from Src2 */
01504                 cmp bl, 0       /* check if it is zero */
01505                         jnz L10192
01506                         mov [edi], 255          /* division by zero: store 255 instead of dividing */
01507                         jmp  L10193
01508 L10192:
01509                 xor ah, ah      /* prepare AX, zero AH register (div uses AX as dividend) */
01510                         mov al, [edx]           /* load a byte from Src1 into AL */
01511                 div   bl                /* divide AX by BL, quotient ends up in AL */
01512                         mov [edi], al           /* move the quotient byte to Dest */
01513 L10193:
01514                 inc edx         /* increment Src1, Src2, Dest */
01515                         inc esi                 /* pointer registers by one */
01516                         inc edi
01517                         dec ecx         /* decrease loop counter */
01518                         jnz L10191      /* check loop termination, proceed if required */
01519                         popa
01520         }
01521 #else
01522         asm volatile
01523                 ("pusha \n\t" "mov %2, %%edx \n\t"      /* load Src1 address into edx */
01524                 "mov %1, %%esi \n\t"    /* load Src2 address into esi */
01525                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
01526                 "mov %3, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
01527                 ".align 16     \n\t"    /* 16 byte alignment of the loop entry */
01528                 "1: mov (%%esi), %%bl  \n\t"    /* load a byte (the divisor) from Src2 */
01529                 "cmp       $0, %%bl  \n\t"      /* check if it is zero */
01530                 "jnz 2f              \n\t" "movb  $255, (%%edi) \n\t"   /* division by zero: store 255 instead of dividing */
01531                 "jmp 3f              \n\t" "2:                  \n\t" "xor   %%ah, %%ah    \n\t"        /* prepare AX, zero AH register (div uses AX as dividend) */
01532                 "mov   (%%edx), %%al \n\t"      /* load a byte from Src1 into AL */
01533                 "div   %%bl          \n\t"      /* divide AX by BL, quotient ends up in AL */
01534                 "mov   %%al, (%%edi) \n\t"      /* move the quotient byte to Dest */
01535                 "3: inc %%edx        \n\t"      /* increment Src1, Src2, Dest */
01536                 "inc %%esi \n\t"                /* pointer registers by one */
01537                 "inc %%edi \n\t" "dec %%ecx    \n\t"    /* decrease loop counter */
01538                 "jnz 1b       \n\t"     /* check loop termination, proceed if required */
01539                 "popa \n\t":"=m" (Dest) /* %0 */
01540                 :"m"(Src2),             /* %1 */
01541                 "m"(Src1),              /* %2 */
01542                 "m"(SrcLength)          /* %3 */
01543                 );
01544 #endif
01545         return (0);
01546 #else
01547         return (-1);
01548 #endif
01549 }
01550 
/*!
\brief Filter using Div: D = S1 / S2

A zero byte in Src2 produces 255 in Dest. This matches what the assembly
routine stores for a zero divisor, so both code paths give the same result
and the C path never performs an integer division by zero.

\param Src1 Pointer to the start of the first source byte array (S1, the dividends).
\param Src2 Pointer to the start of the second source byte array (S2, the divisors).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Prefer the assembly routine. It processes single bytes, so there is no
	 * unaligned tail to finish. It returns -1 when it was compiled out; in
	 * that case nothing was written and the C loop below does the work.
	 */
	if (SDL_imageFilterMMXdetect() && (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0))
		return (0);

	/* C routine to process image */
	for (i = 0; i < length; i++) {
		if (Src2[i] == 0) {
			/* Division by zero = 255, same convention as the assembly path */
			Dest[i] = 255;
		} else {
			Dest[i] = (unsigned char) ((int) Src1[i] / (int) Src2[i]);
		}
	}

	return (0);
}
01603 
01604 /* ------------------------------------------------------------------------------------ */
01605 
/*!
\brief Internal MMX routine for filter BitNegation: D = !S (bitwise complement), 8 bytes per iteration.

The iteration count is SrcLength/8 and is tested only after the first 8-byte group has been
processed, so the caller must pass SrcLength >= 8; trailing bytes beyond the last full group
of 8 are not touched by this routine.

NOTE(review): the GCC variant declares Dest as an "=m" output although the asm only reads the
pointer value, and it lists no clobbers - confirm against the GCC extended-asm rules.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.

\return Returns 0 when the routine is compiled in (USE_MMX defined), or -1 when it is not.
*/
01615 int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
01616 {
01617 #ifdef USE_MMX
01618 #if !defined(GCC__)
01619         __asm
01620         {
01621                 pusha
01622                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
01623                         mov eax, Src1           /* load Src1 address into eax */
01624                         mov edi, Dest           /* load Dest address into edi */
01625                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01626                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
01627                         align 16                /* 16 byte alignment of the loop entry */
01628 L91117:
01629                 movq mm0, [eax]         /* load 8 bytes from Src1 into mm0 */
01630                 pxor mm0, mm1           /* negate mm0 by xoring with mm1 */
01631                         movq [edi], mm0         /* store result in Dest */
01632                         add eax, 8      /* increase Src1 and Dest */
01633                         add edi,  8     /* register pointers by 8 */
01634                         dec ecx         /* decrease loop counter */
01635                         jnz L91117              /* check loop termination, proceed if required */
01636                         emms                    /* exit MMX state */
01637                         popa
01638         }
01639 #else
01640         asm volatile
01641                 ("pusha              \n\t" "pcmpeqb   %%mm1, %%mm1 \n\t"        /* generate all 1's in mm1 */
01642                 "mov %1, %%eax \n\t"    /* load Src1 address into eax */
01643                 "mov %0, %%edi \n\t"    /* load Dest address into edi */
01644                 "mov %2, %%ecx \n\t"    /* load loop counter (SIZE) into ecx */
01645                 "shr $3, %%ecx \n\t"    /* counter/8 (MMX loads 8 bytes at a time) */
01646                 ".align 16       \n\t"  /* 16 byte alignment of the loop entry */
01647                 "1: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into mm0 */
01648                 "pxor      %%mm1, %%mm0 \n\t"   /* negate mm0 by xoring with mm1 */
01649                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
01650                 "add $8, %%eax \n\t"    /* increase Src1 and Dest register pointers by 8 */
01651                 "add $8, %%edi \n\t" "dec %%ecx     \n\t"       /* decrease loop counter */
01652                 "jnz 1b        \n\t"    /* check loop termination, proceed if required */
01653                 "emms          \n\t"    /* exit MMX state */
01654                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01655                 :"m"(Src1),             /* %1 */
01656                 "m"(SrcLength)          /* %2 */
01657                 );
01658 #endif
01659         return (0);
01660 #else
01661         return (-1);
01662 #endif
01663 }
01664 
/*!
\brief Filter using BitNegation: D = !S (every bit of every byte inverted)

When MMX is reported as available and at least 8 bytes are supplied, the MMX
routine handles the leading multiple-of-8 portion and plain C handles the tail;
otherwise plain C handles the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte still to be processed in C */

	/* Reject missing buffers; an empty request is trivially successful */
	if (Src1 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if (SDL_imageFilterMMXdetect() && length >= 8) {
		/* Bulk part: the MMX routine consumes whole groups of 8 bytes */
		SDL_imageFilterBitNegationMMX(Src1, Dest, length);
		/* Skip what MMX already did; equals length when nothing is left over */
		pos = length - (length & 7u);
	}

	/* Remaining bytes (or everything when MMX was not used) */
	while (pos < length) {
		Dest[pos] = ~Src1[pos];
		pos++;
	}

	return (0);
}
01716 
/*!
\brief Internal MMX routine for filter AddByte: D = saturation255(S + C), 8 bytes per iteration.

C is replicated into all 8 bytes of MM1 and added to each 8-byte group with unsigned saturation.
The iteration count is SrcLength/8 and is tested only after the first group has been processed,
so the caller must pass SrcLength >= 8; trailing bytes beyond the last full group of 8 are not
touched by this routine.

NOTE(review): the GCC variant declares Dest as an "=m" output although the asm only reads the
pointer value, and it lists no clobbers - confirm against the GCC extended-asm rules.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 when the routine is compiled in (USE_MMX defined), or -1 when it is not.
*/
01727 int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
01728 {
01729 #ifdef USE_MMX
01730 #if !defined(GCC__)
01731         __asm
01732         {
01733                 pusha
01734                         /* ** Duplicate C in 8 bytes of MM1 ** */
01735                         mov al, C       /* load C into AL */
01736                         mov ah, al      /* copy AL into AH */
01737                         mov bx, ax      /* copy AX into BX */
01738                         shl eax, 16     /* shift 2 bytes of EAX left */
01739                         mov ax, bx      /* copy BX into AX */
01740                         movd mm1, eax           /* copy EAX into MM1 */
01741                         movd mm2, eax           /* copy EAX into MM2 */
01742                         punpckldq mm1, mm2      /* fill higher bytes of MM1 with C */
01743                         mov eax, Src1           /* load Src1 address into eax */
01744                         mov edi, Dest           /* load Dest address into edi */
01745                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01746                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
01747                         align 16                        /* 16 byte alignment of the loop entry */
01748 L1021:
01749                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
01750                 paddusb mm0,  mm1       /* MM0=Src1+C (add 8 bytes with saturation) */
01751                         movq [edi], mm0         /* store result in Dest */
01752                         add eax, 8      /* increase Src1 register pointer by 8 */
01753                         add edi, 8      /* increase Dest register pointer by 8 */
01754                         dec              ecx            /* decrease loop counter */
01755                         jnz             L1021           /* check loop termination, proceed if required */
01756                         emms                            /* exit MMX state */
01757                         popa
01758         }
01759 #else
01760         asm volatile
01761                 ("pusha              \n\t"
01762                 /* ** Duplicate C in 8 bytes of MM1 ** */
01763                 "mov           %3, %%al \n\t"   /* load C into AL */
01764                 "mov         %%al, %%ah \n\t"   /* copy AL into AH */
01765                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
01766                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
01767                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
01768                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
01769                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
01770                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher bytes of MM1 with C */
01771                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
01772                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
01773                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
01774                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
01775                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
01776                 "1:                     \n\t" 
01777                 "movq    (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
01778                 "paddusb   %%mm1, %%mm0 \n\t"   /* MM0=Src1+C (add 8 bytes with saturation) */
01779                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
01780                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
01781                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
01782                 "dec              %%ecx \n\t"   /* decrease loop counter */
01783                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
01784                 "emms                   \n\t"   /* exit MMX state */
01785                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01786                 :"m"(Src1),             /* %1 */
01787                 "m"(SrcLength),         /* %2 */
01788                 "m"(C)                  /* %3 */
01789                 );
01790 #endif
01791         return (0);
01792 #else
01793         return (-1);
01794 #endif
01795 }
01796 
/*!
\brief Filter using AddByte: D = saturation255(S + C)

When C is 0 the result is a plain copy of the source into the destination.
The copy uses memmove so that in-place operation (Src1 == Dest) and
overlapping buffers are well defined.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 leaves every byte unchanged, so copy source to destination */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine handles all complete groups of 8 bytes */
		SDL_imageFilterAddByteMMX(Src1, Dest, length, C);

		/* Continue in C with the unaligned tail, if any */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (or the whole image) */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
01862 
/*!
\brief Internal MMX routine for filter AddUint: adds an 8-byte constant built from C and D, with
unsigned byte saturation, to each 8-byte group of the source.

MM1 is loaded with C in its low 32 bits and D in its high 32 bits; paddusb then adds it bytewise
to every 8-byte group. The iteration count is SrcLength/8 and is tested only after the first group
has been processed, so the caller must pass SrcLength >= 8; trailing bytes beyond the last full
group of 8 are not touched by this routine.

NOTE(review): the GCC variant declares Dest as an "=m" output although the asm only reads the
pointer value, and it lists no clobbers - confirm against the GCC extended-asm rules.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant whose 4 bytes are added to bytes 0..3 of each 8-byte group.
\param D Constant whose 4 bytes are added to bytes 4..7 of each 8-byte group.

\return Returns 0 when the routine is compiled in (USE_MMX defined), or -1 when it is not.
*/
01874 int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
01875 {
01876 #ifdef USE_MMX
01877 #if !defined(GCC__)
01878         __asm
01879         {
01880                 pusha
01881                         /* ** Build the 8-byte addend in MM1: low dword = C, high dword = D ** */
01882                         mov eax, C      /* load C into EAX */
01883                         movd mm1, eax           /* copy EAX into MM1 */
01884                         mov eax, D      /* load D into EAX */
01885                         movd mm2, eax           /* copy EAX into MM2 */
01886                         punpckldq mm1, mm2      /* fill higher bytes of MM1 with D */
01887                         mov eax, Src1           /* load Src1 address into eax */
01888                         mov edi, Dest           /* load Dest address into edi */
01889                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
01890                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
01891                         align 16                        /* 16 byte alignment of the loop entry */
01892 L11023:
01893                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
01894                 paddusb mm0,  mm1       /* MM0=Src1+MM1 (add 8 bytes with saturation) */
01895                         movq [edi],  mm0        /* store result in Dest */
01896                         add eax, 8      /* increase Src1 register pointer by 8 */
01897                         add edi, 8      /* increase Dest register pointer by 8 */
01898                         dec              ecx            /* decrease loop counter */
01899                         jnz             L11023          /* check loop termination, proceed if required */
01900                         emms                            /* exit MMX state */
01901                         popa
01902         }
01903 #else
01904         asm volatile
01905                 ("pusha              \n\t"
01906                 /* ** Build the 8-byte addend in MM1: low dword = C, high dword = D ** */
01907                 "mov          %3, %%eax \n\t"   /* load C into EAX */
01908                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
01909                 "mov          %4, %%eax \n\t"   /* load D into EAX */
01910                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
01911                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher bytes of MM1 with D */
01912                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
01913                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
01914                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
01915                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
01916                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
01917                 "1:                     \n\t" 
01918                 "movq    (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
01919                 "paddusb   %%mm1, %%mm0 \n\t"   /* MM0=Src1+MM1 (add 8 bytes with saturation) */
01920                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
01921                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
01922                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
01923                 "dec              %%ecx \n\t"   /* decrease loop counter */
01924                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
01925                 "emms                   \n\t"   /* exit MMX state */
01926                 "popa                   \n\t":"=m" (Dest)       /* %0 */
01927                 :"m"(Src1),             /* %1 */
01928                 "m"(SrcLength),         /* %2 */
01929                 "m"(C),                 /* %3 */
01930                 "m"(D)                  /* %4 */
01931                 );
01932 #endif
01933         return (0);
01934 #else
01935         return (-1);
01936 #endif
01937 }
01938 
/*!
\brief Filter using AddUint: adds the four bytes of C, with saturation at 255, to the source bytes.

In the C code path the byte at offset k from the start of the C-processed
region receives byte (k % 4) of C, counting from the least significant byte.
When C is 0 the result is a plain copy of the source into the destination;
the copy uses memmove so that in-place operation (Src1 == Dest) and
overlapping buffers are well defined.

NOTE(review): the MMX path passes D = SWAP_32(C) as the upper half of each
8-byte addend, so bytes 4..7 of every 8-byte group receive C's bytes in the
reverse order compared with this C loop. Which order is intended cannot be
determined from this function - confirm before relying on either.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, istart, D;
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 leaves every byte unchanged, so copy source to destination */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine handles all complete groups of 8 bytes */
		D = SWAP_32(C);
		SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);

		/* Continue in C with the unaligned tail, if any */
		istart = length & 0xfffffff8;
	}

	/* Split C into its four bytes, least significant first */
	iC[0] = (int) ((C >>  0) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[3] = (int) ((C >> 24) & 0xff);

	/* C routine: cycle through the four constant bytes, starting at istart */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC[(i - istart) & 3];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
02011 
/*!
\brief Internal MMX routine for filter AddByteToHalf: D = saturation255(S/2 + C), 8 bytes per iteration.

Each 8-byte group is shifted right by one bit as four 16-bit words, ANDed with the 8-byte Mask,
and then C (replicated into all 8 bytes of MM1) is added with unsigned saturation. The caller in
this file passes eight 0x7F bytes as Mask, which clears the top bit of every byte after the word
shift. The iteration count is SrcLength/8 and is tested only after the first group has been
processed, so the caller must pass SrcLength >= 8; trailing bytes beyond the last full group of 8
are not touched by this routine.

NOTE(review): the GCC variant declares Dest as an "=m" output although the asm only reads the
pointer value, and it lists no clobbers - confirm against the GCC extended-asm rules.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param Mask Pointer to 8 mask bytes applied to each shifted 8-byte group.

\return Returns 0 when the routine is compiled in (USE_MMX defined), or -1 when it is not.
*/
02023 int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
02024                                                                         unsigned char *Mask)
02025 {
02026 #ifdef USE_MMX
02027 #if !defined(GCC__)
02028         __asm
02029         {
02030                 pusha
02031                         /* ** Duplicate C in 8 bytes of MM1 ** */
02032                         mov al, C       /* load C into AL */
02033                         mov ah, al      /* copy AL into AH */
02034                         mov bx, ax      /* copy AX into BX */
02035                         shl eax, 16     /* shift 2 bytes of EAX left */
02036                         mov ax, bx      /* copy BX into AX */
02037                         movd mm1, eax           /* copy EAX into MM1 */
02038                         movd mm2, eax           /* copy EAX into MM2 */
02039                         punpckldq mm1, mm2      /* fill higher bytes of MM1 with C */
02040                         mov edx, Mask           /* load Mask address into edx */
02041                         movq mm0, [edx]         /* load Mask into mm0 */
02042                 mov eax, Src1           /* load Src1 address into eax */
02043                         mov edi, Dest           /* load Dest address into edi */
02044                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
02045                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
02046                         align 16                        /* 16 byte alignment of the loop entry */
02047 L1022:
02048                 movq mm2, [eax]         /* load 8 bytes from Src1 into MM2 */
02049                 psrlw mm2, 1    /* shift 4 WORDS of MM2 1 bit to the right */
02050                         pand mm2, mm0        /* apply Mask to 8 BYTES of MM2 */
02051                         /* equivalent raw encoding: byte 0x0f, 0xdb, 0xd0 */
02052                         paddusb mm2,  mm1       /* MM2=(Src1>>1)+C (add 8 bytes with saturation) */
02053                         movq [edi], mm2         /* store result in Dest */
02054                         add eax, 8      /* increase Src1 register pointer by 8 */
02055                         add edi, 8      /* increase Dest register pointer by 8 */
02056                         dec              ecx            /* decrease loop counter */
02057                         jnz             L1022           /* check loop termination, proceed if required */
02058                         emms                            /* exit MMX state */
02059                         popa
02060         }
02061 #else
02062         asm volatile
02063                 ("pusha              \n\t"
02064                 /* ** Duplicate C in 8 bytes of MM1 ** */
02065                 "mov           %3, %%al \n\t"   /* load C into AL */
02066                 "mov         %%al, %%ah \n\t"   /* copy AL into AH */
02067                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
02068                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
02069                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
02070                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
02071                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
02072                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher bytes of MM1 with C */
02073                 "movl         %4, %%edx \n\t"   /* load Mask address into edx */
02074                 "movq    (%%edx), %%mm0 \n\t"   /* load Mask into mm0 */
02075                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02076                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02077                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02078                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02079                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
02080                 "1:                     \n\t" 
02081                 "movq    (%%eax), %%mm2 \n\t"   /* load 8 bytes from Src1 into MM2 */
02082                 "psrlw        $1, %%mm2 \n\t"   /* shift 4 WORDS of MM2 1 bit to the right */
02083                 /* apply Mask to 8 BYTES of MM2: the .byte sequence below is the hand-encoded form of "pand %%mm0, %%mm2" */
02084                 ".byte     0x0f, 0xdb, 0xd0 \n\t" 
02085                 "paddusb   %%mm1, %%mm2 \n\t"   /* MM2=(Src1>>1)+C (add 8 bytes with saturation) */
02086                 "movq    %%mm2, (%%edi) \n\t"   /* store result in Dest */
02087                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02088                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02089                 "dec              %%ecx \n\t"   /* decrease loop counter */
02090                 "jnz                  1b \n\t"  /* check loop termination, proceed if required */
02091                 "emms                   \n\t"   /* exit MMX state */
02092                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02093                 :"m"(Src1),             /* %1 */
02094                 "m"(SrcLength),         /* %2 */
02095                 "m"(C),                 /* %3 */
02096                 "m"(Mask)                       /* %4 */
02097                 );
02098 #endif
02099         return (0);
02100 #else
02101         return (-1);
02102 #endif
02103 }
02104 
/*!
\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)

When MMX is reported as available and at least 8 bytes are supplied, the MMX
routine handles the leading multiple-of-8 portion and plain C handles the tail;
otherwise plain C handles the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	/* Per-byte mask handed to the MMX routine; 0x7F keeps the low seven bits of each byte */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int pos = 0;	/* index of the first byte still to be processed in C */
	int sum;

	/* Reject missing buffers; an empty request is trivially successful */
	if (Src1 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if (SDL_imageFilterMMXdetect() && length >= 8) {
		/* Bulk part: the MMX routine consumes whole groups of 8 bytes */
		SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
		/* Skip what MMX already did; equals length when nothing is left over */
		pos = length - (length & 7u);
	}

	/* Remaining bytes (or everything when MMX was not used) */
	for (; pos < length; pos++) {
		sum = (int) (Src1[pos] / 2) + (int) C;
		Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
	}

	return (0);
}
02166 
/*!
\brief Internal MMX implementation of filter SubByte: D = saturation0(S - C)

The constant C is replicated into all 8 bytes of MM1; the loop then loads
8 bytes from Src1, subtracts MM1 with unsigned saturation (psubusb) and stores
the 8 result bytes to Dest. SrcLength/8 groups are processed; any remaining
(SrcLength & 7) bytes are not touched by this routine.

The loop counter is decremented and tested after the loop body, so the body
always runs at least once: with SrcLength < 8 the counter starts at 0 and
wraps on the first dec. Callers must therefore pass SrcLength >= 8.

NOTE(review): in the GCC variant all operands use "m" constraints and are read
after pusha has modified ESP, and Dest is declared as an output ("=m") although
it is only read - confirm this is safe with the compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 when compiled with USE_MMX, -1 otherwise (nothing is processed).
*/
02177 int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
02178 {
02179 #ifdef USE_MMX
02180 #if !defined(GCC__)
02181         __asm
02182         {
02183                 pusha
02184                         /* ** Duplicate C in 8 bytes of MM1 ** */
02185                         mov al, C       /* load C into AL */
02186                         mov ah, al      /* copy AL into AH */
02187                         mov bx, ax      /* copy AX into BX */
02188                         shl eax, 16     /* shift 2 bytes of EAX left */
02189                         mov ax, bx      /* copy BX into AX */
02190                         movd mm1, eax           /* copy EAX into MM1 */
02191                         movd mm2, eax           /* copy EAX into MM2 */
02192                         punpckldq mm1, mm2      /* fill higher bytes of MM1 with C */
02193                         mov eax, Src1           /* load Src1 address into eax */
02194                         mov edi, Dest           /* load Dest address into edi */
02195                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
02196                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
02197                         align 16                        /* 16 byte alignment of the loop entry */
02198 L1023:
02199                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
02200                 psubusb mm0,  mm1       /* MM0=Src1-C (sub 8 bytes with saturation) */
02201                         movq [edi], mm0         /* store result in Dest */
02202                         add eax, 8      /* increase Src1 register pointer by 8 */
02203                         add edi, 8      /* increase Dest register pointer by 8 */
02204                         dec              ecx            /* decrease loop counter */
02205                         jnz             L1023           /* check loop termination, proceed if required */
02206                         emms                            /* exit MMX state */
02207                         popa
02208         }
02209 #else
02210         asm volatile
02211                 ("pusha              \n\t"
02212                 /* ** Duplicate C in 8 bytes of MM1 ** */
02213                 "mov           %3, %%al \n\t"   /* load C into AL */
02214                 "mov         %%al, %%ah \n\t"   /* copy AL into AH */
02215                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
02216                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
02217                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
02218                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
02219                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
02220                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher bytes of MM1 with C */
02221                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02222                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02223                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02224                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02225                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
02226                 "1: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
02227                 "psubusb   %%mm1, %%mm0 \n\t"   /* MM0=Src1-C (sub 8 bytes with saturation) */
02228                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
02229                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02230                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02231                 "dec              %%ecx \n\t"   /* decrease loop counter */
02232                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
02233                 "emms                   \n\t"   /* exit MMX state */
02234                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02235                 :"m"(Src1),             /* %1 */
02236                 "m"(SrcLength),         /* %2 */
02237                 "m"(C)                  /* %3 */
02238                 );
02239 #endif
02240         return (0);
02241 #else
02242         return (-1);
02243 #endif
02244 }
02245 
/*!
\brief Filter using SubByte: D = saturation0(S - C)

Subtracts the constant C from every byte of Src1 and writes the result,
clamped at 0, to Dest. When MMX is detected and at least 8 bytes are given,
the complete 8-byte groups are handled by SDL_imageFilterSubByteMMX() and only
the remaining (length & 7) bytes are processed by the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
        unsigned int i, istart;
        int iC;
        unsigned char *cursrc1;
        unsigned char *curdest;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Special case: C==0 - the result is an unmodified copy of the source. */
        /* memmove (destination first!) is used so that in-place calls with */
        /* Src1 == Dest or overlapping buffers remain well defined. */
        if (C == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* MMX routine */
                SDL_imageFilterSubByteMMX(Src1, Dest, length, C);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) *cursrc1 - iC;
                /* Saturate at the lower bound */
                if (result < 0)
                        result = 0;
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }
        return (0);
}
02311 
/*!
\brief Internal MMX implementation of filter SubUint: 8 bytes at a time, D = saturation0(S - pattern)

An 8-byte subtraction pattern is built in MM1 with C in the lower 4 bytes and
D in the upper 4 bytes (punpckldq). The loop then loads 8 bytes from Src1,
subtracts the pattern bytewise with unsigned saturation (psubusb) and stores
the result to Dest. SrcLength/8 groups are processed; any remaining
(SrcLength & 7) bytes are not touched by this routine.

The loop counter is decremented and tested after the loop body, so the body
always runs at least once: with SrcLength < 8 the counter starts at 0 and
wraps on the first dec. Callers must therefore pass SrcLength >= 8.

NOTE(review): in the GCC variant all operands use "m" constraints and are read
after pusha has modified ESP, and Dest is declared as an output ("=m") although
it is only read - confirm this is safe with the compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Four constant bytes subtracted from bytes 0..3 of every 8-byte group.
\param D Four constant bytes subtracted from bytes 4..7 of every 8-byte group.

\return Returns 0 when compiled with USE_MMX, -1 otherwise (nothing is processed).
*/
02323 int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
02324 {
02325 #ifdef USE_MMX
02326 #if !defined(GCC__)
02327         __asm
02328         {
02329                 pusha
02330                         /* ** Build the 8 byte pattern in MM1: C (low dword), D (high dword) ** */
02331                         mov eax, C      /* load C into EAX */
02332                         movd mm1, eax           /* copy EAX into MM1 */
02333                         mov eax, D      /* load D into EAX */
02334                         movd mm2, eax           /* copy EAX into MM2 */
02335                         punpckldq mm1, mm2      /* fill higher 4 bytes of MM1 with D */
02336                         mov eax, Src1           /* load Src1 address into eax */
02337                         mov edi, Dest           /* load Dest address into edi */
02338                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
02339                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
02340                         align 16                        /* 16 byte alignment of the loop entry */
02341 L11024:
02342                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
02343                 psubusb mm0, mm1        /* MM0=Src1-pattern (sub 8 bytes with saturation) */
02344                         movq [edi], mm0         /* store result in Dest */
02345                         add eax, 8      /* increase Src1 register pointer by 8 */
02346                         add edi, 8      /* increase Dest register pointer by 8 */
02347                         dec              ecx            /* decrease loop counter */
02348                         jnz             L11024          /* check loop termination, proceed if required */
02349                         emms                            /* exit MMX state */
02350                         popa
02351         }
02352 #else
02353         asm volatile
02354                 ("pusha              \n\t"
02355                 /* ** Build the 8 byte pattern in MM1: C (low dword), D (high dword) ** */
02356                 "mov          %3, %%eax \n\t"   /* load C into EAX */
02357                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
02358                 "mov          %4, %%eax \n\t"   /* load D into EAX */
02359                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
02360                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher 4 bytes of MM1 with D */
02361                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02362                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02363                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02364                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02365                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
02366                 "1: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
02367                 "psubusb   %%mm1, %%mm0 \n\t"   /* MM0=Src1-pattern (sub 8 bytes with saturation) */
02368                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
02369                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02370                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02371                 "dec              %%ecx \n\t"   /* decrease loop counter */
02372                 "jnz                  1b \n\t"  /* check loop termination, proceed if required */
02373                 "emms                   \n\t"   /* exit MMX state */
02374                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02375                 :"m"(Src1),             /* %1 */
02376                 "m"(SrcLength),         /* %2 */
02377                 "m"(C),                 /* %3 */
02378                 "m"(D)                  /* %4 */
02379                 );
02380 #endif
02381         return (0);
02382 #else
02383         return (-1);
02384 #endif
02385 }
02386 
/*!
\brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4])

The four bytes of C are used as a repeating subtraction pattern. In the C
routine the byte at index i has byte (i % 4) of C subtracted from it, where
byte 0 is the least significant byte of C; results below 0 are clamped to 0.
When MMX is detected and at least 8 bytes are given, the complete 8-byte
groups are handled by SDL_imageFilterSubUintMMX() and only the remaining
(length & 7) bytes are processed by the C loop.

NOTE(review): the MMX routine is called with D = SWAP_32(C) as the pattern for
bytes 4..7 of every 8-byte group, whereas the C routine applies the bytes of C
in the same order to every 4-byte group. The two paths therefore do not
produce the same output for bytes 4..7 unless C is byte-symmetric - confirm
which behaviour is intended before relying on it.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant holding the four bytes to subtract (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
        unsigned int i, istart, D;
        int iC[4];
        unsigned char *cursrc1;
        unsigned char *curdest;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Special case: C==0 - the result is an unmodified copy of the source. */
        /* memmove (destination first!) is used so that in-place calls with */
        /* Src1 == Dest or overlapping buffers remain well defined. */
        if (C == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* MMX routine */
                D=SWAP_32(C);
                SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        iC[3] = (int) ((C >> 24) & 0xff);
        iC[2] = (int) ((C >> 16) & 0xff);
        iC[1] = (int) ((C >>  8) & 0xff);
        iC[0] = (int) ((C >>  0) & 0xff);
        /* istart is either 0 or a multiple of 8, so (i & 3) is the position */
        /* of byte i inside its 4-byte group. */
        for (i = istart; i < length; i++) {
                result = (int) *cursrc1 - iC[i & 3];
                /* Saturate at the lower bound */
                if (result < 0)
                        result = 0;
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }
        return (0);
}
02459 
/*!
\brief Internal MMX implementation of filter ShiftRight: D = saturation0(S >> N)

MMX has no per-byte shift, so the data is shifted as four 16-bit words (psrlw)
and the bits that crossed over from the neighbouring byte are masked away.
The routine works in two phases:

1. A bit mask is prepared in MM1: starting from all ones, it is N times
   shifted right by one bit (as words) and ANDed with the 8 bytes at Mask.
2. For each of the SrcLength/8 groups, 8 bytes are loaded from Src1, shifted
   right by N bits (as words), ANDed with MM1 and stored to Dest.

Any remaining (SrcLength & 7) bytes are not touched by this routine.

Both loops decrement and test their counter after the loop body, so each body
always runs at least once: with N == 0 the 8-bit mask counter wraps, and with
SrcLength < 8 the data counter wraps. Callers must pass N >= 1 and
SrcLength >= 8.

NOTE(review): in the GCC variant all operands use "m" constraints and are read
after pusha has modified ESP, and Dest is declared as an output ("=m") although
it is only read - confirm this is safe with the compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit positions to shift (N).
\param Mask Pointer to 8 mask bytes used to build the per-byte bit mask.

\return Returns 0 when compiled with USE_MMX, -1 otherwise (nothing is processed).
*/
02471 int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
02472                                                                  unsigned char *Mask)
02473 {
02474 #ifdef USE_MMX
02475 #if !defined(GCC__)
02476         __asm
02477         {
02478                 pusha
02479                         mov edx, Mask           /* load Mask address into edx */
02480                         movq mm0, [edx]         /* load Mask into mm0 */
02481                 xor ecx, ecx    /* zero ECX */
02482                         mov cl,  N      /* load loop counter (N) into CL */
02483                         movd mm3,  ecx  /* copy (N) into MM3  */
02484                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
02485 L10240:                         /* ** Prepare proper bit-Mask in MM1 ** */
02486                 psrlw mm1,  1   /* shift 4 WORDS of MM1 1 bit to the right */
02487                         pand mm1, mm0   // apply Mask to 8 BYTES of MM1 */
02488                         /*  byte     0x0f, 0xdb, 0xc8 */
02489                         dec               cl            /* decrease loop counter */
02490                         jnz            L10240           /* check loop termination, proceed if required */
02491                         /* ** Shift all bytes of the image ** */
02492                         mov eax, Src1           /* load Src1 address into eax */
02493                         mov edi, Dest           /* load Dest address into edi */
02494                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
02495                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
02496                         align 16                        /* 16 byte alignment of the loop entry */
02497 L10241:
02498                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
02499                 psrlw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the right */
02500                         pand mm0, mm1    // apply proper bit-Mask to 8 BYTES of MM0 */
02501                         /* byte     0x0f, 0xdb, 0xc1 */
02502                         movq [edi], mm0         /* store result in Dest */
02503                         add eax, 8      /* increase Src1 register pointer by 8 */
02504                         add edi, 8      /* increase Dest register pointer by 8 */
02505                         dec              ecx            /* decrease loop counter */
02506                         jnz            L10241           /* check loop termination, proceed if required */
02507                         emms                            /* exit MMX state */
02508                         popa
02509         }
02510 #else
02511         asm volatile
02512                 ("pusha              \n\t" "movl         %4, %%edx \n\t"        /* load Mask address into edx */
02513                 "movq    (%%edx), %%mm0 \n\t"   /* load Mask into mm0 */
02514                 "xor       %%ecx, %%ecx \n\t"   /* zero ECX */
02515                 "mov           %3, %%cl \n\t"   /* load loop counter (N) into CL */
02516                 "movd      %%ecx, %%mm3 \n\t"   /* copy (N) into MM3  */
02517                 "pcmpeqb   %%mm1, %%mm1 \n\t"   /* generate all 1's in mm1 */
02518                 "1:                     \n\t"   /* ** Prepare proper bit-Mask in MM1 ** */
02519                 "psrlw        $1, %%mm1 \n\t"   /* shift 4 WORDS of MM1 1 bit to the right */
02520                 /*    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1 */
02521                 ".byte     0x0f, 0xdb, 0xc8 \n\t" 
02522                 "dec               %%cl \n\t"   /* decrease loop counter */
02523                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
02524                 /* ** Shift all bytes of the image ** */
02525                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02526                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02527                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02528                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02529                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
02530                 "2:                     \n\t" 
02531                 "movq    (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
02532                 "psrlw     %%mm3, %%mm0 \n\t"   /* shift 4 WORDS of MM0 (N) bits to the right */
02533                 /*    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0 */
02534                 ".byte     0x0f, 0xdb, 0xc1 \n\t" 
02535                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
02536                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02537                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02538                 "dec              %%ecx \n\t"   /* decrease loop counter */
02539                 "jnz                 2b \n\t"   /* check loop termination, proceed if required */
02540                 "emms                   \n\t"   /* exit MMX state */
02541                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02542                 :"m"(Src1),             /* %1 */
02543                 "m"(SrcLength),         /* %2 */
02544                 "m"(N),                 /* %3 */
02545                 "m"(Mask)                       /* %4 */
02546                 );
02547 #endif
02548         return (0);
02549 #else
02550         return (-1);
02551 #endif
02552 }
02553 
/*!
\brief Filter using ShiftRight: D = saturation0(S >> N)

Shifts every byte of Src1 right by N bits and writes the result to Dest.
When MMX is detected and at least 8 bytes are given, the complete 8-byte
groups are handled by SDL_imageFilterShiftRightMMX() and only the remaining
(length & 7) bytes are processed by the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL array pointer or N > 8).
*/
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        /* Per-byte mask handed to the MMX routine to build its bit mask */
        static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
        unsigned int i, istart;
        unsigned char *cursrc1;
        unsigned char *curdest;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 - the result is an unmodified copy of the source. */
        /* memmove (destination first!) is used so that in-place calls with */
        /* Src1 == Dest or overlapping buffers remain well defined. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* MMX routine */
                SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        for (i = istart; i < length; i++) {
                /* The byte is promoted to int before shifting, so N == 8 is well defined and yields 0 */
                *curdest = (unsigned char) (*cursrc1 >> N);
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }

        return (0);
}
02620 
/*!
\brief Internal MMX implementation of filter ShiftRightUint: D = saturation0((uint)S >> N)

For each of the SrcLength/8 groups, 8 bytes are loaded from Src1, both 32-bit
doublewords are shifted right by N bits (psrld) and the result is stored to
Dest. Any remaining (SrcLength & 7) bytes are not touched by this routine.

The loop counter is decremented and tested after the loop body, so the body
always runs at least once: with SrcLength < 8 the counter starts at 0 and
wraps on the first dec. Callers must therefore pass SrcLength >= 8.

NOTE(review): N is handed to psrld as a memory operand. The memory form of
psrld takes its shift count from a 64-bit operand while N is a single byte -
confirm that the count actually used equals N (e.g. by loading N into an MMX
register first, as SDL_imageFilterShiftRightMMX does).

NOTE(review): in the GCC variant all operands use "m" constraints and are read
after pusha has modified ESP, and Dest is declared as an output ("=m") although
it is only read - confirm this is safe with the compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit positions to shift (N).

\return Returns 0 when compiled with USE_MMX, -1 otherwise (nothing is processed).
*/
02631 int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
02632 {
02633 #ifdef USE_MMX
02634 #if !defined(GCC__)
02635         __asm
02636         {
02637                 pusha
02638                         mov eax, Src1           /* load Src1 address into eax */
02639                         mov edi, Dest           /* load Dest address into edi */
02640                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02641                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
02642                         align 16                        /* 16 byte alignment of the loop entry */
02643 L13023:
02644                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
02645                 psrld mm0, N
02646                         movq [edi], mm0         /* store result in Dest */
02647                         add eax, 8      /* increase Src1 register pointer by 8 */
02648                         add edi, 8      /* increase Dest register pointer by 8 */
02649                         dec              ecx            /* decrease loop counter */
02650                         jnz             L13023          /* check loop termination, proceed if required */
02651                         emms                            /* exit MMX state */
02652                         popa
02653         }
02654 #else
02655         asm volatile
02656                 ("pusha              \n\t"
02657                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02658                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02659                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02660                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02661                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
02662                 "1: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
02663                 "psrld   %3, %%mm0 \n\t"        /* shift 2 DWORDS of MM0 to the right, count taken from N */
02664                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
02665                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02666                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02667                 "dec              %%ecx \n\t"   /* decrease loop counter */
02668                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
02669                 "emms                   \n\t"   /* exit MMX state */
02670                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02671                 :"m"(Src1),             /* %1 */
02672                 "m"(SrcLength),         /* %2 */
02673                 "m"(N)                  /* %3 */
02674                 );
02675 #endif
02676         return (0);
02677 #else
02678         return (-1);
02679 #endif
02680 }
02681 
/*!
\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

Treats the source as a sequence of unsigned int words in native byte order,
shifts each word right by N bits and writes it to Dest. When MMX is detected
and at least 8 bytes are given, the complete 8-byte groups are handled by
SDL_imageFilterShiftRightUintMMX() and only the remaining (length & 7) bytes
are considered by the C loop.

Only complete words are processed: trailing bytes that do not fill a whole
unsigned int are left untouched in Dest (for N != 0).

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit positions to shift (N). Valid range is 0 to 32;
         N == 32 clears every processed word.

\return Returns 0 for success or -1 for error (NULL array pointer or N > 32).
*/
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        unsigned char *cursrc1, *curdest;
        unsigned int word;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        if (N > 32) {
                return (-1);
        }

        /* Special case: N==0 - the result is an unmodified copy of the source. */
        /* memmove (destination first!) is used so that in-place calls with */
        /* Src1 == Dest or overlapping buffers remain well defined. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        /* (length - i) is the number of bytes left; comparing it against the */
        /* word size includes the final complete word and cannot overflow. */
        for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
                /* Go through memcpy: the byte pointers are not guaranteed to be */
                /* suitably aligned for a direct unsigned int access. */
                memcpy(&word, cursrc1, sizeof(word));
                /* Shifting by the full width of the type is undefined in C */
                word = (N < 32) ? (word >> N) : 0;
                memcpy(curdest, &word, sizeof(word));
                /* Advance pointers */
                cursrc1 += sizeof(word);
                curdest += sizeof(word);
        }

        return (0);
}
02751 
02762 int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
02763 {
02764 #ifdef USE_MMX
02765 #if !defined(GCC__)
02766         __asm
02767         {
02768                 pusha
02769                         /* ** Duplicate C in 4 words of MM1 ** */
02770                         mov al, C       /* load C into AL */
02771                         xor ah, ah      /* zero AH */
02772                         mov bx, ax      /* copy AX into BX */
02773                         shl eax, 16     /* shift 2 bytes of EAX left */
02774                         mov ax, bx      /* copy BX into AX */
02775                         movd mm1, eax           /* copy EAX into MM1 */
02776                         movd mm2, eax           /* copy EAX into MM2 */
02777                         punpckldq mm1, mm2      /* fill higher words of MM1 with C */
02778                         pxor mm0, mm0           /* zero MM0 register */
02779                         mov eax, Src1           /* load Src1 address into eax */
02780                         mov edi, Dest           /* load Dest address into edi */
02781                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02782                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
02783                         cmp al, 128     /* if (C <= 128) execute more efficient code */
02784                         jg             L10251
02785                         align 16                        /* 16 byte alignment of the loop entry */
02786 L10250:
02787                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02788                 movq mm4, mm3           /* copy MM3 into MM4  */
02789                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
02790                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
02791                         pmullw mm3, mm1         /* mul low  bytes of SrcDest and MM1 */
02792                         pmullw mm4, mm1         /* mul high bytes of SrcDest and MM1 */
02793                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
02794                         movq [edi], mm3         /* store result in Dest */
02795                         add eax, 8      /* increase Src1 register pointer by 8 */
02796                         add edi, 8      /* increase Dest register pointer by 8 */
02797                         dec              ecx            /* decrease loop counter */
02798                         jnz            L10250           /* check loop termination, proceed if required */
02799                         jmp            L10252
02800                         align 16                        /* 16 byte alignment of the loop entry */
02801 L10251:
02802                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02803                 movq mm4, mm3           /* copy MM3 into MM4  */
02804                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
02805                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
02806                         pmullw mm3, mm1         /* mul low  bytes of SrcDest and MM1 */
02807                         pmullw mm4, mm1         /* mul high bytes of SrcDest and MM1 */
02808                         /* ** Take abs value of the results (signed words) ** */
02809                         movq mm5, mm3           /* copy mm3 into mm5 */
02810                         movq mm6, mm4           /* copy mm4 into mm6 */
02811                         psraw mm5, 15           /* fill mm5 words with word sign bit */
02812                         psraw mm6, 15           /* fill mm6 words with word sign bit */
02813                         pxor mm3, mm5           /* take 1's compliment of only neg words */
02814                         pxor mm4, mm6           /* take 1's compliment of only neg words */
02815                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
02816                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
02817                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
02818                         movq [edi], mm3         /* store result in Dest */
02819                         add eax, 8      /* increase Src1 register pointer by 8 */
02820                         add edi, 8      /* increase Dest register pointer by 8 */
02821                         dec              ecx            /* decrease loop counter */
02822                         jnz            L10251           /* check loop termination, proceed if required */
02823 L10252:
02824                 emms                            /* exit MMX state */
02825                         popa
02826         }
02827 #else
02828         asm volatile
02829                 ("pusha              \n\t"
02830                 /* ** Duplicate C in 4 words of MM1 ** */
02831                 "mov           %3, %%al \n\t"   /* load C into AL */
02832                 "xor         %%ah, %%ah \n\t"   /* zero AH */
02833                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
02834                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
02835                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
02836                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
02837                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
02838                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher words of MM1 with C */
02839                 "pxor      %%mm0, %%mm0 \n\t"   /* zero MM0 register */
02840                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
02841                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
02842                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
02843                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
02844                 "cmp         $128, %%al \n\t"   /* if (C <= 128) execute more efficient code */
02845                 "jg                  2f \n\t" ".align 16              \n\t"     /* 16 byte alignment of the loop entry */
02846                 "1: movq (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
02847                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
02848                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
02849                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
02850                 "pmullw    %%mm1, %%mm3 \n\t"   /* mul low  bytes of SrcDest and MM1 */
02851                 "pmullw    %%mm1, %%mm4 \n\t"   /* mul high bytes of SrcDest and MM1 */
02852                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
02853                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
02854                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02855                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02856                 "dec              %%ecx \n\t"   /* decrease loop counter */
02857                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
02858                 "jmp                 3f \n\t" ".align 16              \n\t"     /* 16 byte alignment of the loop entry */
02859                 "2: movq (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
02860                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
02861                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
02862                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
02863                 "pmullw    %%mm1, %%mm3 \n\t"   /* mul low  bytes of SrcDest and MM1 */
02864                 "pmullw    %%mm1, %%mm4 \n\t"   /* mul high bytes of SrcDest and MM1 */
02865                 /* ** Take abs value of the results (signed words) ** */
02866                 "movq      %%mm3, %%mm5 \n\t"   /* copy mm3 into mm5 */
02867                 "movq      %%mm4, %%mm6 \n\t"   /* copy mm4 into mm6 */
02868                 "psraw       $15, %%mm5 \n\t"   /* fill mm5 words with word sign bit */
02869                 "psraw       $15, %%mm6 \n\t"   /* fill mm6 words with word sign bit */
02870                 "pxor      %%mm5, %%mm3 \n\t"   /* take 1's compliment of only neg. words */
02871                 "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
02872                 "psubsw    %%mm5, %%mm3 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
02873                 "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
02874                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
02875                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
02876                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
02877                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
02878                 "dec              %%ecx \n\t"   /* decrease loop counter */
02879                 "jnz                 2b \n\t"   /* check loop termination, proceed if required */
02880                 "3: emms               \n\t"    /* exit MMX state */
02881                 "popa                   \n\t":"=m" (Dest)       /* %0 */
02882                 :"m"(Src1),             /* %1 */
02883                 "m"(SrcLength),         /* %2 */
02884                 "m"(C)                  /* %3 */
02885                 );
02886 #endif
02887         return (0);
02888 #else
02889         return (-1);
02890 #endif
02891 }
02892 
/*!
\brief Filter using MultByByte: D = saturation255(S * C)

Every source byte is multiplied by the constant C and clamped to 255.
When MMX is reported as available and at least 8 bytes are supplied, the
MMX routine is invoked first and the C loop only finishes the trailing
(length & 7) bytes; otherwise the C loop processes the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant multiplier (C).

\return Returns 0 for success (including length == 0) or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
        unsigned int i, istart;
        int iC;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Special case: C==1 is a plain copy from source to destination.   */
        /* memmove (not memcpy) so that calling with Src1 == Dest or with   */
        /* overlapping buffers stays well defined.                          */
        if (C == 1) {
                memmove(Dest, Src1, length);
                return (0);
        }

        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
                /* The MMX routine consumed all whole 8-byte groups; resume after them. */
                /* If length is a multiple of 8 the loop below does not execute.        */
                istart = length & 0xfffffff8;
        }

        /* C routine to process the (remaining) bytes */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) Src1[i] * iC;
                if (result > 255)
                        result = 255;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
02958 
/*!
\brief Internal MMX implementation of filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)

Processes SrcLength/8 groups of 8 bytes: each group is unpacked to 16-bit
words, shifted right by N (psrlw), multiplied by C keeping the low 16 bits
(pmullw) and packed back to bytes with packuswb. Any trailing
(SrcLength & 7) bytes are not touched here.

The loop counter is decremented before it is tested, so SrcLength must be
at least 8 - a zero group count is not handled.

Uses 32-bit x86 registers and pusha/popa in both the MSVC (__asm) and the
GCC (asm volatile) variant.

NOTE(review): packuswb interprets the words as signed, so a product of
0x8000 or more presumably packs to 0 rather than 255 - confirm against the
C fallback, which clamps to 255.
NOTE(review): the GCC variant declares no clobbers and marks Dest as an
"=m" output although it is only read - confirm this is safe with the
compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array (must be >= 8).
\param N Number of bit-positions to shift each byte to the right.
\param C Constant multiplier (C).

\return Returns 0 when compiled with USE_MMX, otherwise -1.
*/
02970 int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
02971                                                                                           unsigned char C)
02972 {
02973 #ifdef USE_MMX
02974 #if !defined(GCC__)
02975         __asm
02976         {
02977                 pusha
02978                         /* ** Duplicate C in 4 words of MM1 ** */
02979                         mov al, C       /* load C into AL */
02980                         xor ah, ah      /* zero AH */
02981                         mov bx, ax      /* copy AX into BX */
02982                         shl eax, 16     /* shift 2 bytes of EAX left */
02983                         mov ax, bx      /* copy BX into AX */
02984                         movd mm1, eax           /* copy EAX into MM1 */
02985                         movd mm2, eax           /* copy EAX into MM2 */
02986                         punpckldq mm1, mm2      /* fill higher words of MM1 with C */
02987                         xor ecx, ecx    /* zero ECX */
02988                         mov cl, N       /* load N into CL */
02989                         movd mm7, ecx           /* copy N into MM7 */
02990                         pxor mm0, mm0           /* zero MM0 register */
02991                         mov eax, Src1           /* load Src1 address into eax */
02992                         mov edi, Dest           /* load Dest address into edi */
02993                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02994                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
02995                         align 16                        /* 16 byte alignment of the loop entry */
02996 L1026:
02997                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02998                 movq mm4, mm3           /* copy MM3 into MM4  */
02999                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
03000                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
03001                         psrlw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the right */
03002                         psrlw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the right */
03003                         pmullw mm3, mm1         /* mul low  bytes of SrcDest by MM1 */
03004                         pmullw mm4, mm1         /* mul high bytes of SrcDest by MM1 */
03005                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03006                         movq [edi], mm3         /* store result in Dest */
03007                         add eax, 8      /* increase Src1 register pointer by 8 */
03008                         add edi, 8      /* increase Dest register pointer by 8 */
03009                         dec              ecx            /* decrease loop counter */
03010                         jnz             L1026           /* check loop termination, proceed if required */
03011                         emms                            /* exit MMX state */
03012                         popa
03013         }
03014 #else
03015         asm volatile
03016                 ("pusha              \n\t"
03017                 /* ** Duplicate C in 4 words of MM1 ** */
03018                 "mov           %4, %%al \n\t"   /* load C into AL */
03019                 "xor         %%ah, %%ah \n\t"   /* zero AH */
03020                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
03021                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
03022                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
03023                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
03024                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
03025                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher words of MM1 with C */
03026                 "xor       %%ecx, %%ecx \n\t"   /* zero ECX */
03027                 "mov           %3, %%cl \n\t"   /* load N into CL */
03028                 "movd      %%ecx, %%mm7 \n\t"   /* copy N into MM7 */
03029                 "pxor      %%mm0, %%mm0 \n\t"   /* zero MM0 register */
03030                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
03031                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
03032                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
03033                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
03034                 ".align 16             \n\t"    /* 16 byte alignment of the loop entry */
03035                 "1: movq (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
03036                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
03037                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
03038                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
03039                 "psrlw     %%mm7, %%mm3 \n\t"   /* shift 4 WORDS of MM3 (N) bits to the right */
03040                 "psrlw     %%mm7, %%mm4 \n\t"   /* shift 4 WORDS of MM4 (N) bits to the right */
03041                 "pmullw    %%mm1, %%mm3 \n\t"   /* mul low  bytes of SrcDest by MM1 */
03042                 "pmullw    %%mm1, %%mm4 \n\t"   /* mul high bytes of SrcDest by MM1 */
03043                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
03044                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
03045                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
03046                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
03047                 "dec              %%ecx \n\t"   /* decrease loop counter */
03048                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
03049                 "emms                   \n\t"   /* exit MMX state */
03050                 "popa                   \n\t":"=m" (Dest)       /* %0 */
03051                 :"m"(Src1),             /* %1 */
03052                 "m"(SrcLength),         /* %2 */
03053                 "m"(N),                 /* %3 */
03054                 "m"(C)                  /* %4 */
03055                 );
03056 #endif
03057         return (0);
03058 #else
03059         return (-1);
03060 #endif
03061 }
03062 
/*!
\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)

Every source byte is shifted right by N bits, multiplied by the constant C
and clamped to 255. When MMX is reported as available and at least 8 bytes
are supplied, the MMX routine is invoked first and the C loop only finishes
the trailing (length & 7) bytes; otherwise the C loop processes the whole
array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant multiplier (C).

\return Returns 0 for success (including length == 0) or -1 if Src1 or Dest is NULL or N > 8.
*/
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
                                           unsigned char C)
{
        unsigned int i, istart;
        int iC;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 && C==1 is a plain copy from source to destination. */
        /* memmove (not memcpy) so that calling with Src1 == Dest or with         */
        /* overlapping buffers stays well defined.                                */
        if ((N == 0) && (C == 1)) {
                memmove(Dest, Src1, length);
                return (0);
        }

        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
                /* The MMX routine consumed all whole 8-byte groups; resume after them. */
                /* If length is a multiple of 8 the loop below does not execute.        */
                istart = length & 0xfffffff8;
        }

        /* C routine to process the (remaining) bytes */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) (Src1[i] >> N) * iC;
                if (result > 255)
                        result = 255;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
03135 
/*!
\brief Internal MMX implementation of filter using ShiftLeftByte: D = (S << N)

MMX has no per-byte shift, so each group of 8 bytes is shifted as four
16-bit words (psllw) and then ANDed with a bit-mask that clears the low
N bits of every byte, removing the bits that crossed over from the
neighbouring byte. The bit-mask is built first: starting from all ones it
is shifted left by one bit and ANDed with the 8 bytes read from Mask, N
times. In the GCC variant the two pand instructions are emitted as raw
opcode bytes (.byte 0x0f, 0xdb, ...); the commented-out lines next to them
show the intended mnemonic.

Processes SrcLength/8 groups of 8 bytes; trailing (SrcLength & 7) bytes are
not touched here. Both loops decrement their counter before testing it, so
N == 0 or SrcLength < 8 are not handled by this routine.

Uses 32-bit x86 registers and pusha/popa in both the MSVC (__asm) and the
GCC (asm volatile) variant.

NOTE(review): the GCC variant declares no clobbers and marks Dest as an
"=m" output although it is only read - confirm this is safe with the
compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array (must be >= 8).
\param N Number of bit-positions to shift each byte to the left (must be > 0).
\param Mask Pointer to 8 mask bytes loaded into MM0.

\return Returns 0 when compiled with USE_MMX, otherwise -1.
*/
03147 int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
03148                                                                         unsigned char *Mask)
03149 {
03150 #ifdef USE_MMX
03151 #if !defined(GCC__)
03152         __asm
03153         {
03154                 pusha
03155                         mov edx, Mask           /* load Mask address into edx */
03156                         movq mm0, [edx]         /* load Mask into mm0 */
03157                 xor ecx, ecx    /* zero ECX */
03158                         mov cl, N       /* load loop counter (N) into CL */
03159                         movd mm3, ecx           /* copy (N) into MM3  */
03160                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
03161 L10270:                         /* ** Prepare proper bit-Mask in MM1 ** */
03162                 psllw mm1, 1    /* shift 4 WORDS of MM1 1 bit to the left */
03163                         pand mm1, mm0        // apply Mask to 8 BYTES of MM1 */
03164                         /*  byte     0x0f, 0xdb, 0xc8 */
03165                         dec cl                          /* decrease loop counter */
03166                         jnz            L10270           /* check loop termination, proceed if required */
03167                         /* ** Shift all bytes of the image ** */
03168                         mov eax, Src1           /* load Src1 address into eax */
03169                         mov edi, Dest           /* load SrcDest address into edi */
03170                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03171                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03172                         align 16                        /* 16 byte alignment of the loop entry */
03173 L10271:
03174                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03175                 psllw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the left */
03176                         pand mm0, mm1    // apply proper bit-Mask to 8 BYTES of MM0 */
03177                         /* byte     0x0f, 0xdb, 0xc1 */
03178                         movq [edi], mm0         /* store result in Dest */
03179                         add eax, 8      /* increase Src1 register pointer by 8 */
03180                         add edi, 8      /* increase Dest register pointer by 8 */
03181                         dec              ecx            /* decrease loop counter */
03182                         jnz            L10271           /* check loop termination, proceed if required */
03183                         emms                            /* exit MMX state */
03184                         popa
03185         }
03186 #else
03187         asm volatile
03188                 ("pusha              \n\t" "movl         %4, %%edx \n\t"        /* load Mask address into edx */
03189                 "movq    (%%edx), %%mm0 \n\t"   /* load Mask into mm0 */
03190                 "xor       %%ecx, %%ecx \n\t"   /* zero ECX */
03191                 "mov           %3, %%cl \n\t"   /* load loop counter (N) into CL */
03192                 "movd      %%ecx, %%mm3 \n\t"   /* copy (N) into MM3  */
03193                 "pcmpeqb   %%mm1, %%mm1 \n\t"   /* generate all 1's in mm1 */
03194                 "1:                     \n\t"   /* ** Prepare proper bit-Mask in MM1 ** */
03195                 "psllw        $1, %%mm1 \n\t"   /* shift 4 WORDS of MM1 1 bit to the left */
03196                 /*    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1 */
03197                 ".byte     0x0f, 0xdb, 0xc8 \n\t" "dec %%cl               \n\t" /* decrease loop counter */
03198                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
03199                 /* ** Shift all bytes of the image ** */
03200                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
03201                 "mov          %0, %%edi \n\t"   /* load SrcDest address into edi */
03202                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
03203                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
03204                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
03205                 "2: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
03206                 "psllw     %%mm3, %%mm0 \n\t"   /* shift 4 WORDS of MM0 (N) bits to the left */
03207                 /*    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0 */
03208                 ".byte     0x0f, 0xdb, 0xc1 \n\t" "movq    %%mm0, (%%edi) \n\t" /* store result in Dest */
03209                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
03210                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
03211                 "dec              %%ecx \n\t"   /* decrease loop counter */
03212                 "jnz                 2b \n\t"   /* check loop termination, proceed if required */
03213                 "emms                   \n\t"   /* exit MMX state */
03214                 "popa                   \n\t":"=m" (Dest)       /* %0 */
03215                 :"m"(Src1),             /* %1 */
03216                 "m"(SrcLength),         /* %2 */
03217                 "m"(N),                 /* %3 */
03218                 "m"(Mask)                       /* %4 */
03219                 );
03220 #endif
03221         return (0);
03222 #else
03223         return (-1);
03224 #endif
03225 }
03226 
/*!
\brief Filter using ShiftLeftByte: D = (S << N)

Every source byte is shifted left by N bits; bits shifted out of the byte
are discarded. When MMX is reported as available and at least 8 bytes are
supplied, the MMX routine is invoked first and the C loop only finishes the
trailing (length & 7) bytes; otherwise the C loop processes the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success (including length == 0) or -1 if Src1 or Dest is NULL or N > 8.
*/
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        /* Per-byte mask handed to the MMX routine, which uses it to build the bit-mask for the shift */
        static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
        unsigned int i, istart;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 is a plain copy from source to destination.   */
        /* memmove (not memcpy) so that calling with Src1 == Dest or with   */
        /* overlapping buffers stays well defined.                          */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
                /* The MMX routine consumed all whole 8-byte groups; resume after them. */
                /* If length is a multiple of 8 the loop below does not execute.        */
                istart = length & 0xfffffff8;
        }

        /* C routine to process the (remaining) bytes */
        for (i = istart; i < length; i++) {
                result = ((int) Src1[i] << N) & 0xff;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
03292 
/*!
\brief Internal MMX implementation of filter using ShiftLeftUint: D = (S << N), where S and D are 32-bit words

Processes SrcLength/8 groups of 8 bytes: each group is loaded into MM0,
shifted with pslld (which operates on the two 32-bit halves independently)
and stored to Dest. Trailing (SrcLength & 7) bytes are not touched here.

The loop counter is decremented before it is tested, so SrcLength must be
at least 8 - a zero group count is not handled.

Uses 32-bit x86 registers and pusha/popa in both the MSVC (__asm) and the
GCC (asm volatile) variant.

NOTE(review): N is passed to pslld as a memory operand. With a memory
operand pslld presumably reads a 64-bit shift count, while N is a single
byte - the bytes following N in memory would then be part of the count.
Confirm against the instruction reference and consider loading N into an
MMX register first, as the other shift routines in this file do.
NOTE(review): the GCC variant declares no clobbers and marks Dest as an
"=m" output although it is only read - confirm this is safe with the
compiler options in use.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array (must be >= 8).
\param N Number of bit-positions to shift each 32-bit word to the left.

\return Returns 0 when compiled with USE_MMX, otherwise -1.
*/
03303 int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
03304 {
03305 #ifdef USE_MMX
03306 #if !defined(GCC__)
03307         __asm
03308         {
03309                 pusha
03310                         mov eax, Src1           /* load Src1 address into eax */
03311                         mov edi, Dest           /* load Dest address into edi */
03312                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03313                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03314                         align 16                        /* 16 byte alignment of the loop entry */
03315 L12023:
03316                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03317                 pslld mm0, N    /* shift 2 DWORDS of MM0 (N) bits to the left */
03318                         movq [edi], mm0         /* store result in Dest */
03319                         add eax, 8      /* increase Src1 register pointer by 8 */
03320                         add edi, 8      /* increase Dest register pointer by 8 */
03321                         dec              ecx            /* decrease loop counter */
03322                         jnz             L12023          /* check loop termination, proceed if required */
03323                         emms                            /* exit MMX state */
03324                         popa
03325         }
03326 #else
03327         asm volatile
03328                 ("pusha              \n\t"
03329                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
03330                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
03331                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
03332                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
03333                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
03334                 "1: movq (%%eax), %%mm0 \n\t"   /* load 8 bytes from Src1 into MM0 */
03335                 "pslld   %3, %%mm0 \n\t"        /* shift 2 DWORDS of MM0 (N) bits to the left */
03336                 "movq    %%mm0, (%%edi) \n\t"   /* store result in Dest */
03337                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
03338                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
03339                 "dec              %%ecx \n\t"   /* decrease loop counter */
03340                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
03341                 "emms                   \n\t"   /* exit MMX state */
03342                 "popa                   \n\t":"=m" (Dest)       /* %0 */
03343                 :"m"(Src1),             /* %1 */
03344                 "m"(SrcLength),         /* %2 */
03345                 "m"(N)                  /* %3 */
03346                 );
03347 #endif
03348         return (0);
03349 #else
03350         return (-1);
03351 #endif
03352 }
03353 
/*!
\brief Filter using ShiftLeftUint: D = ((unsigned int)S << N)

The arrays are treated as sequences of native unsigned int words and every
whole word is shifted left by N bits. When MMX is reported as available and
at least 8 bytes are supplied, the MMX routine is invoked first and the C
loop only finishes the words in the trailing (length & 7) bytes; otherwise
the C loop processes the whole array.

Trailing bytes that do not form a whole unsigned int are not written to Dest.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success (including length == 0) or -1 if Src1 or Dest is NULL or N > 32.
*/
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        unsigned int word;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        if (N > 32) {
                return (-1);
        }

        /* Special case: N==0 is a plain copy from source to destination.   */
        /* memmove (not memcpy) so that calling with Src1 == Dest or with   */
        /* overlapping buffers stays well defined.                          */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
                /* The MMX routine consumed all whole 8-byte groups; resume after them. */
                istart = length & 0xfffffff8;
        }

        /* C routine to process the (remaining) whole words.                      */
        /* The condition also accepts the case where exactly one word is left,    */
        /* and is written as a subtraction so that it cannot overflow.            */
        for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
                /* Load/store through memcpy: the byte pointers need not be aligned for unsigned int */
                memcpy(&word, &Src1[i], sizeof(word));
                /* N == 32 passes validation but is not a valid C shift count; all bits are shifted out */
                word = (N < 32) ? (word << N) : 0;
                memcpy(&Dest[i], &word, sizeof(word));
        }

        return (0);
}
03423 
03434 int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
03435 {
03436 #ifdef USE_MMX
03437 #if !defined(GCC__)
03438         __asm
03439         {
03440                 pusha
03441                         xor eax, eax    /* zero EAX */
03442                         mov al, N       /* load N into AL */
03443                         movd mm7, eax           /* copy N into MM7 */
03444                         pxor mm0, mm0           /* zero MM0 register */
03445                         mov eax, Src1           /* load Src1 address into eax */
03446                         mov edi, Dest           /* load Dest address into edi */
03447                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03448                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03449                         cmp al, 7       /* if (N <= 7) execute more efficient code */
03450                         jg             L10281
03451                         align 16                        /* 16 byte alignment of the loop entry */
03452 L10280:
03453                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
03454                 movq mm4, mm3           /* copy MM3 into MM4  */
03455                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
03456                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
03457                         psllw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the right */
03458                         psllw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the right */
03459                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03460                         movq [edi], mm3         /* store result in Dest */
03461                         add eax, 8      /* increase Src1 register pointer by 8 */
03462                         add edi, 8      /* increase Dest register pointer by 8 */
03463                         dec              ecx            /* decrease loop counter */
03464                         jnz            L10280           /* check loop termination, proceed if required */
03465                         jmp            L10282
03466                         align 16                        /* 16 byte alignment of the loop entry */
03467 L10281:
03468                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
03469                 movq mm4, mm3           /* copy MM3 into MM4  */
03470                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
03471                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
03472                         psllw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the right */
03473                         psllw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the right */
03474                         /* ** Take abs value of the signed words ** */
03475                         movq mm5, mm3           /* copy mm3 into mm5 */
03476                         movq mm6, mm4           /* copy mm4 into mm6 */
03477                         psraw mm5, 15           /* fill mm5 words with word sign bit */
03478                         psraw mm6, 15           /* fill mm6 words with word sign bit */
03479                         pxor mm3, mm5           /* take 1's compliment of only neg words */
03480                         pxor mm4, mm6           /* take 1's compliment of only neg words */
03481                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
03482                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
03483                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03484                         movq [edi], mm3         /* store result in Dest */
03485                         add eax, 8      /* increase Src1 register pointer by 8 */
03486                         add edi, 8      /* increase Dest register pointer by 8 */
03487                         dec              ecx            /* decrease loop counter */
03488                         jnz            L10281           /* check loop termination, proceed if required */
03489 L10282:
03490                 emms                            /* exit MMX state */
03491                         popa
03492         }
03493 #else
03494         asm volatile
03495                 ("pusha              \n\t" "xor       %%eax, %%eax \n\t"        /* zero EAX */
03496                 "mov           %3, %%al \n\t"   /* load N into AL */
03497                 "movd      %%eax, %%mm7 \n\t"   /* copy N into MM7 */
03498                 "pxor      %%mm0, %%mm0 \n\t"   /* zero MM0 register */
03499                 "mov         %1, %%eax  \n\t"   /* load Src1 address into eax */
03500                 "mov         %0, %%edi  \n\t"   /* load Dest address into edi */
03501                 "mov         %2, %%ecx  \n\t"   /* load loop counter (SIZE) into ecx */
03502                 "shr         $3, %%ecx  \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
03503                 "cmp           $7, %%al \n\t"   /* if (N <= 7) execute more efficient code */
03504                 "jg                  2f \n\t" ".align 16              \n\t"     /* 16 byte alignment of the loop entry */
03505                 "1: movq (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
03506                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
03507                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
03508                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
03509                 "psllw     %%mm7, %%mm3 \n\t"   /* shift 4 WORDS of MM3 (N) bits to the right */
03510                 "psllw     %%mm7, %%mm4 \n\t"   /* shift 4 WORDS of MM4 (N) bits to the right */
03511                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
03512                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
03513                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
03514                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
03515                 "dec              %%ecx \n\t"   /* decrease loop counter */
03516                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
03517                 "jmp                 3f \n\t" ".align 16              \n\t"     /* 16 byte alignment of the loop entry */
03518                 "2: movq (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
03519                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
03520                 "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
03521                 "punpckhbw %%mm0, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
03522                 "psllw     %%mm7, %%mm3 \n\t"   /* shift 4 WORDS of MM3 (N) bits to the right */
03523                 "psllw     %%mm7, %%mm4 \n\t"   /* shift 4 WORDS of MM4 (N) bits to the right */
03524                 /* ** Take abs value of the signed words ** */
03525                 "movq      %%mm3, %%mm5 \n\t"   /* copy mm3 into mm5 */
03526                 "movq      %%mm4, %%mm6 \n\t"   /* copy mm4 into mm6 */
03527                 "psraw       $15, %%mm5 \n\t"   /* fill mm5 words with word sign bit */
03528                 "psraw       $15, %%mm6 \n\t"   /* fill mm6 words with word sign bit */
03529                 "pxor      %%mm5, %%mm3 \n\t"   /* take 1's compliment of only neg. words */
03530                 "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
03531                 "psubsw    %%mm5, %%mm3 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
03532                 "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
03533                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
03534                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
03535                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
03536                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
03537                 "dec              %%ecx \n\t"   /* decrease loop counter */
03538                 "jnz                 2b \n\t"   /* check loop termination, proceed if required */
03539                 "3: emms                \n\t"   /* exit MMX state */
03540                 "popa                   \n\t":"=m" (Dest)       /* %0 */
03541                 :"m"(Src1),             /* %1 */
03542                 "m"(SrcLength),         /* %2 */
03543                 "m"(N)                  /* %3 */
03544                 );
03545 #endif
03546         return (0);
03547 #else
03548         return (-1);
03549 #endif
03550 }
03551 
/*!
\brief Filter using ShiftLeft: D = saturation255(S << N)

Every source byte is shifted left by N bits; results larger than 255 are
clamped to 255.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes to process.
\param N Number of bit positions to shift. Values above 8 are rejected.

\return Returns 0 for success (including length == 0) or -1 for error
        (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy of the source into the destination.
	   memmove (destination first!) is used so that in-place calls with
	   Src1 == Dest or otherwise overlapping buffers stay well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* By default the C loop below handles the whole image. */
	istart = 0;

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine works on whole groups of 8 bytes; its return
		   value is not checked here. */
		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);

		/* Continue with the (length & 7) trailing bytes, if any. When
		   length is a multiple of 8 the loop below does not execute. */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] << N;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
03618 
/*!
\brief Internal MMX implementation of filter using BinarizeUsingThreshold: D = (S >= T) ? 255 : 0

The threshold T is replicated into all 8 bytes of an MMX register. For each
group of 8 source bytes the routine adds (0xFF - T) with unsigned saturation
and compares the sum against 0xFF, which yields 255 where S >= T and 0
elsewhere. Only SrcLength/8 whole groups are processed; the trailing
(SrcLength & 7) bytes are left untouched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success, or -1 if MMX support was not compiled in
        (USE_MMX undefined) or a pointer is NULL.
*/
int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
{
#ifdef USE_MMX
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* With fewer than 8 bytes the loop counter (SrcLength >> 3) would be 0;
	   the dec/jnz loop below would then wrap around and run 2^32 times,
	   overrunning both buffers. There is no whole group to process. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant below lists Dest as an "=m" output
	   although the asm only reads it, declares no "memory" clobber, and
	   references "m" operands after pusha has moved ESP - confirm the
	   operands are not addressed ESP-relative on the target compiler. */
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Duplicate T in 8 bytes of MM3 ** */
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		pcmpeqb mm2, mm2	/* generate all 1's in mm2 */
		mov al, T	/* load T into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm3, eax	/* copy EAX into MM3 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with T */
		psubusb mm2, mm3	/* store 0xFF - T in MM2 */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1029:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm2	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
		pcmpeqb mm0, mm1	/* binarize 255:0, comparing to 255 */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1029	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha              \n\t"
		/* ** Duplicate T in 8 bytes of MM3 ** */
		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"pcmpeqb   %%mm2, %%mm2 \n\t"	/* generate all 1's in mm2 */
		"mov           %3, %%al \n\t"	/* load T into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm3 \n\t"	/* copy EAX into MM3 */
		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
		"punpckldq %%mm4, %%mm3 \n\t"	/* fill higher bytes of MM3 with T */
		"psubusb   %%mm3, %%mm2 \n\t"	/* store 0xFF - T in MM2 */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1:                     \n\t"
		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"paddusb   %%mm2, %%mm0 \n\t"	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
		"pcmpeqb   %%mm1, %%mm0 \n\t"	/* binarize 255:0, comparing to 255 */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(T)			/* %3 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
03706 
/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255 : 0

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes to process.
\param T The threshold boundary (inclusive).

\return Returns 0 for success (including length == 0) or -1 if either
        pointer is NULL.
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int pos;
	unsigned int first = 0;	/* index of the first byte left for the C loop */

	/* Reject missing buffers; an empty image is trivially done. */
	if ((Src1 == NULL) || (Dest == NULL)) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	/* Every byte is >= 0, so a zero threshold turns the whole output white. */
	if (T == 0) {
		memset(Dest, 255, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine covers the leading whole groups of 8 bytes;
		   its return value is not inspected. */
		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);

		/* Only the (length & 7) trailing bytes remain; when there are
		   none, first == length and the loop below is skipped. */
		first = length & 0xfffffff8;
	}

	/* Plain C for whatever has not been processed yet. */
	for (pos = first; pos < length; pos++) {
		if (Src1[pos] >= T) {
			Dest[pos] = 255;
		} else {
			Dest[pos] = 0;
		}
	}

	return (0);
}
03766 
/*!
\brief Internal MMX implementation of filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Tmax and Tmin are replicated into all 8 bytes of MMX registers. Each group
of 8 source bytes then goes through three unsigned saturating byte
operations: add (0xFF - Tmax), subtract (0xFF - Tmax + Tmin), add Tmin.
For Tmin <= Tmax this clamps every byte into [Tmin, Tmax]. Only SrcLength/8
whole groups are processed; the trailing (SrcLength & 7) bytes are left
untouched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success, or -1 if MMX support was not compiled in
        (USE_MMX undefined) or a pointer is NULL.
*/
int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
								  unsigned char Tmax)
{
#ifdef USE_MMX
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* With fewer than 8 bytes the loop counter (SrcLength >> 3) would be 0;
	   the dec/jnz loop below would then wrap around and run 2^32 times,
	   overrunning both buffers. There is no whole group to process. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant below lists Dest as an "=m" output
	   although the asm only reads it, declares no "memory" clobber, and
	   references "m" operands after pusha has moved ESP - confirm the
	   operands are not addressed ESP-relative on the target compiler. */
#if !defined(GCC__)
	__asm
	{
		pusha
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		/* ** Duplicate Tmax in 8 bytes of MM3 ** */
		mov al, Tmax	/* load Tmax into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm3, eax	/* copy EAX into MM3 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with Tmax */
		psubusb mm1, mm3	/* store 0xFF - Tmax in MM1 */
		/* ** Duplicate Tmin in 8 bytes of MM5 ** */
		mov al, Tmin	/* load Tmin into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm5, eax	/* copy EAX into MM5 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm5, mm4	/* fill higher bytes of MM5 with Tmin */
		movq mm7, mm5	/* copy MM5 into MM7 */
		paddusb mm7, mm1	/* store 0xFF - Tmax + Tmin in MM7 */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1030:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm1	/* MM0=Src1+(0xFF-Tmax) */
		psubusb mm0, mm7	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		paddusb mm0, mm5	/* MM0=MM0+Tmin */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1030	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha              \n\t"
		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		/* ** Duplicate Tmax in 8 bytes of MM3 ** */
		"mov           %4, %%al \n\t"	/* load Tmax into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm3 \n\t"	/* copy EAX into MM3 */
		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
		"punpckldq %%mm4, %%mm3 \n\t"	/* fill higher bytes of MM3 with Tmax */
		"psubusb   %%mm3, %%mm1 \n\t"	/* store 0xFF - Tmax in MM1 */
		/* ** Duplicate Tmin in 8 bytes of MM5 ** */
		"mov           %3, %%al \n\t"	/* load Tmin into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm5 \n\t"	/* copy EAX into MM5 */
		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
		"punpckldq %%mm4, %%mm5 \n\t"	/* fill higher bytes of MM5 with Tmin */
		"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
		"paddusb   %%mm1, %%mm7 \n\t"	/* store 0xFF - Tmax + Tmin in MM7 */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1:                     \n\t"
		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"paddusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1+(0xFF-Tmax) */
		"psubusb   %%mm7, %%mm0 \n\t"	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		"paddusb   %%mm5, %%mm0 \n\t"	/* MM0=MM0+Tmin */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(Tmin),		/* %3 */
		"m"(Tmax)		/* %4 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
03878 
/*!
\brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Bytes below Tmin become Tmin, bytes above Tmax become Tmax, all other bytes
are copied unchanged.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes to process.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success (including length == 0) or -1 if either
        pointer is NULL.
*/
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
							   unsigned char Tmax)
{
	unsigned int i, istart;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: Tmin==0 && Tmax==255 covers the full byte range, so
	   nothing is clipped and the result is a plain copy of the source.
	   memmove (destination first!) keeps in-place calls with
	   Src1 == Dest or otherwise overlapping buffers well defined. */
	if ((Tmin == 0) && (Tmax == 255)) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* By default the C loop below handles the whole image. */
	istart = 0;

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine works on whole groups of 8 bytes; its return
		   value is not checked here. */
		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);

		/* Continue with the (length & 7) trailing bytes, if any. When
		   length is a multiple of 8 the loop below does not execute. */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes */
	for (i = istart; i < length; i++) {
		if (Src1[i] < Tmin) {
			Dest[i] = Tmin;
		} else if (Src1[i] > Tmax) {
			Dest[i] = Tmax;
		} else {
			Dest[i] = Src1[i];
		}
	}

	return (0);
}
03946 
03960 int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
03961                                                                           int Nmin, int Nmax)
03962 {
03963 #ifdef USE_MMX
03964 #if !defined(GCC__)
03965         __asm
03966         {
03967                 pusha
03968                         mov ax, WORD PTR Nmax           /* load Nmax in AX */
03969                         mov bx, WORD PTR Cmax           /* load Cmax in BX */
03970                         sub ax, WORD PTR Nmin           /* AX = Nmax - Nmin */
03971                         sub bx, WORD PTR Cmin           /* BX = Cmax - Cmin */
03972                         jz             L10311           /* check division by zero */
03973                         xor dx, dx      /* prepare for division, zero DX */
03974                         div               bx            /* AX = AX/BX */
03975                         jmp            L10312
03976 L10311:
03977                 mov ax, 255     /* if div by zero, assume result max byte value */
03978 L10312:                         /* ** Duplicate AX in 4 words of MM0 ** */
03979                 mov bx, ax      /* copy AX into BX */
03980                         shl eax, 16     /* shift 2 bytes of EAX left */
03981                         mov ax, bx      /* copy BX into AX */
03982                         movd mm0, eax           /* copy EAX into MM0 */
03983                         movd mm1, eax           /* copy EAX into MM1 */
03984                         punpckldq mm0, mm1      /* fill higher words of MM0 with AX */
03985                         /* ** Duplicate Cmin in 4 words of MM1 ** */
03986                         mov ax, WORD PTR Cmin           /* load Cmin into AX */
03987                         mov bx, ax      /* copy AX into BX */
03988                         shl eax, 16     /* shift 2 bytes of EAX left */
03989                         mov ax, bx      /* copy BX into AX */
03990                         movd mm1, eax           /* copy EAX into MM1 */
03991                         movd mm2, eax           /* copy EAX into MM2 */
03992                         punpckldq mm1, mm2      /* fill higher words of MM1 with Cmin */
03993                         /* ** Duplicate Nmin in 4 words of MM2 ** */
03994                         mov ax, WORD PTR Nmin           /* load Nmin into AX */
03995                         mov bx, ax      /* copy AX into BX */
03996                         shl eax, 16     /* shift 2 bytes of EAX left */
03997                         mov ax, bx      /* copy BX into AX */
03998                         movd mm2, eax           /* copy EAX into MM2 */
03999                         movd mm3, eax           /* copy EAX into MM3 */
04000                         punpckldq mm2, mm3      /* fill higher words of MM2 with Nmin */
04001                         pxor mm7, mm7           /* zero MM7 register */
04002                         mov eax, Src1           /* load Src1 address into eax */
04003                         mov edi, Dest           /* load Dest address into edi */
04004                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
04005                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
04006                         align 16                        /* 16 byte alignment of the loop entry */
04007 L1031:
04008                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
04009                 movq mm4, mm3           /* copy MM3 into MM4  */
04010                         punpcklbw mm3, mm7      /* unpack low  bytes of SrcDest into words */
04011                         punpckhbw mm4, mm7      /* unpack high bytes of SrcDest into words */
04012                         psubusb mm3, mm1        /* S-Cmin, low  bytes */
04013                         psubusb mm4, mm1        /* S-Cmin, high bytes */
04014                         pmullw mm3, mm0         /* MM0*(S-Cmin), low  bytes */
04015                         pmullw mm4, mm0         /* MM0*(S-Cmin), high bytes */
04016                         paddusb mm3, mm2        /* MM0*(S-Cmin)+Nmin, low  bytes */
04017                         paddusb mm4, mm2        /* MM0*(S-Cmin)+Nmin, high bytes */
04018                         /* ** Take abs value of the signed words ** */
04019                         movq mm5, mm3           /* copy mm3 into mm5 */
04020                         movq mm6, mm4           /* copy mm4 into mm6 */
04021                         psraw mm5, 15           /* fill mm5 words with word sign bit */
04022                         psraw mm6, 15           /* fill mm6 words with word sign bit */
04023                         pxor mm3, mm5           /* take 1's compliment of only neg words */
04024                         pxor mm4, mm6           /* take 1's compliment of only neg words */
04025                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
04026                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
04027                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
04028                         movq [edi], mm3         /* store result in Dest */
04029                         add eax, 8      /* increase Src1 register pointer by 8 */
04030                         add edi, 8      /* increase Dest register pointer by 8 */
04031                         dec              ecx            /* decrease loop counter */
04032                         jnz             L1031           /* check loop termination, proceed if required */
04033                         emms                            /* exit MMX state */
04034                         popa
04035         }
04036 #else
04037         asm volatile
04038                 ("pusha              \n\t" "mov           %6, %%ax \n\t"        /* load Nmax in AX */
04039                 "mov           %4, %%bx \n\t"   /* load Cmax in BX */
04040                 "sub           %5, %%ax \n\t"   /* AX = Nmax - Nmin */
04041                 "sub           %3, %%bx \n\t"   /* BX = Cmax - Cmin */
04042                 "jz                  1f \n\t"   /* check division by zero */
04043                 "xor         %%dx, %%dx \n\t"   /* prepare for division, zero DX */
04044                 "div               %%bx \n\t"   /* AX = AX/BX */
04045                 "jmp                 2f \n\t" "1:                     \n\t" "mov         $255, %%ax \n\t"       /* if div by zero, assume result max. byte value */
04046                 "2:                    \n\t"    /* ** Duplicate AX in 4 words of MM0 ** */
04047                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
04048                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
04049                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
04050                 "movd      %%eax, %%mm0 \n\t"   /* copy EAX into MM0 */
04051                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
04052                 "punpckldq %%mm1, %%mm0 \n\t"   /* fill higher words of MM0 with AX */
04053                 /* ** Duplicate Cmin in 4 words of MM1 ** */
04054                 "mov           %3, %%ax \n\t"   /* load Cmin into AX */
04055                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
04056                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
04057                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
04058                 "movd      %%eax, %%mm1 \n\t"   /* copy EAX into MM1 */
04059                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
04060                 "punpckldq %%mm2, %%mm1 \n\t"   /* fill higher words of MM1 with Cmin */
04061                 /* ** Duplicate Nmin in 4 words of MM2 ** */
04062                 "mov           %5, %%ax \n\t"   /* load Nmin into AX */
04063                 "mov         %%ax, %%bx \n\t"   /* copy AX into BX */
04064                 "shl         $16, %%eax \n\t"   /* shift 2 bytes of EAX left */
04065                 "mov         %%bx, %%ax \n\t"   /* copy BX into AX */
04066                 "movd      %%eax, %%mm2 \n\t"   /* copy EAX into MM2 */
04067                 "movd      %%eax, %%mm3 \n\t"   /* copy EAX into MM3 */
04068                 "punpckldq %%mm3, %%mm2 \n\t"   /* fill higher words of MM2 with Nmin */
04069                 "pxor      %%mm7, %%mm7 \n\t"   /* zero MM7 register */
04070                 "mov          %1, %%eax \n\t"   /* load Src1 address into eax */
04071                 "mov          %0, %%edi \n\t"   /* load Dest address into edi */
04072                 "mov          %2, %%ecx \n\t"   /* load loop counter (SIZE) into ecx */
04073                 "shr          $3, %%ecx \n\t"   /* counter/8 (MMX loads 8 bytes at a time) */
04074                 ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04075                 "1:                     \n\t" 
04076                 "movq    (%%eax), %%mm3 \n\t"   /* load 8 bytes from Src1 into MM3 */
04077                 "movq      %%mm3, %%mm4 \n\t"   /* copy MM3 into MM4  */
04078                 "punpcklbw %%mm7, %%mm3 \n\t"   /* unpack low  bytes of SrcDest into words */
04079                 "punpckhbw %%mm7, %%mm4 \n\t"   /* unpack high bytes of SrcDest into words */
04080                 "psubusb   %%mm1, %%mm3 \n\t"   /* S-Cmin, low  bytes */
04081                 "psubusb   %%mm1, %%mm4 \n\t"   /* S-Cmin, high bytes */
04082                 "pmullw    %%mm0, %%mm3 \n\t"   /* MM0*(S-Cmin), low  bytes */
04083                 "pmullw    %%mm0, %%mm4 \n\t"   /* MM0*(S-Cmin), high bytes */
04084                 "paddusb   %%mm2, %%mm3 \n\t"   /* MM0*(S-Cmin)+Nmin, low  bytes */
04085                 "paddusb   %%mm2, %%mm4 \n\t"   /* MM0*(S-Cmin)+Nmin, high bytes */
04086                 /* ** Take abs value of the signed words ** */
04087                 "movq      %%mm3, %%mm5 \n\t"   /* copy mm3 into mm5 */
04088                 "movq      %%mm4, %%mm6 \n\t"   /* copy mm4 into mm6 */
04089                 "psraw       $15, %%mm5 \n\t"   /* fill mm5 words with word sign bit */
04090                 "psraw       $15, %%mm6 \n\t"   /* fill mm6 words with word sign bit */
04091                 "pxor      %%mm5, %%mm3 \n\t"   /* take 1's compliment of only neg. words */
04092                 "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
04093                 "psubsw    %%mm5, %%mm3 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
04094                 "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
04095                 "packuswb  %%mm4, %%mm3 \n\t"   /* pack words back into bytes with saturation */
04096                 "movq    %%mm3, (%%edi) \n\t"   /* store result in Dest */
04097                 "add          $8, %%eax \n\t"   /* increase Src1 register pointer by 8 */
04098                 "add          $8, %%edi \n\t"   /* increase Dest register pointer by 8 */
04099                 "dec              %%ecx \n\t"   /* decrease loop counter */
04100                 "jnz                 1b \n\t"   /* check loop termination, proceed if required */
04101                 "emms                   \n\t"   /* exit MMX state */
04102                 "popa                   \n\t":"=m" (Dest)       /* %0 */
04103                 :"m"(Src1),             /* %1 */
04104                 "m"(SrcLength),         /* %2 */
04105                 "m"(Cmin),              /* %3 */
04106                 "m"(Cmax),              /* %4 */
04107                 "m"(Nmin),              /* %5 */
04108                 "m"(Nmax)                       /* %6 */
04109                 );
04110 #endif
04111         return (0);
04112 #else
04113         return (-1);
04114 #endif
04115 }
04116 
/*!
\brief Filter using NormalizeLinear: D = clamp0..255(((Nmax - Nmin) / (Cmax - Cmin)) * (S - Cmin) + Nmin)

The scale factor (Nmax - Nmin) / (Cmax - Cmin) is computed once with integer
division. When MMX is detected and length > 7, the whole call is first handed to
SDL_imageFilterNormalizeLinearMMX() and the C loop below only processes the
trailing (length & 7) bytes; otherwise the C loop processes the entire buffer.

If Cmax == Cmin the C loop is skipped and 0 is returned without writing the
bytes the C loop would have handled.

All arithmetic is done in int; the constants are presumably meant to be in or
near the byte range (TODO confirm against callers) - extreme values can overflow.

\param Src Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Cmin Normalization constant (lower bound of the source range).
\param Cmax Normalization constant (upper bound of the source range).
\param Nmin Normalization constant (lower bound of the target range).
\param Nmax Normalization constant (upper bound of the target range).

\return Returns 0 for success or -1 if Src or Dest is NULL.
*/
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
                                   int Nmax)
{
        unsigned int i, istart;
        unsigned char *cursrc;
        unsigned char *curdest;
        int dN, dC, factor;
        int result;

        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* The MMX routine works on groups of 8 bytes; its return value is not checked here */
                SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes: start at the last multiple of 8 */
                        istart = length & 0xfffffff8;
                        cursrc = &Src[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc = Src;
                curdest = Dest;
        }

        /* C routine to process image */
        dC = Cmax - Cmin;
        if (dC == 0)
                return (0);     /* degenerate source range: avoid division by zero */
        dN = Nmax - Nmin;
        factor = dN / dC;       /* integer scale factor (truncating division) */
        for (i = istart; i < length; i++) {
                result = factor * ((int) (*cursrc) - Cmin) + Nmin;
                /*
                 * Saturate to the byte range. The lower bound matters as much as
                 * the upper one: a negative result would otherwise wrap around
                 * to a large value when converted to unsigned char.
                 */
                if (result > 255)
                        result = 255;
                else if (result < 0)
                        result = 0;
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc++;
                curdest++;
        }

        return (0);
}
04184 
04185 /* ------------------------------------------------------------------------------------ */
04186 
/*!
\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( (Kernel * Sij) / Divisor )

MMX-only 3x3 convolution of a byte image. For every interior pixel (the first
and last row and column are skipped, so (rows-2) x (columns-2) destination bytes
are written, starting at Dest + columns + 1) the kernel-weighted sum of the
source neighbourhood is formed in 16-bit words with signed saturating adds,
divided by Divisor with a 16-bit signed IDIV, and packed to a byte with
unsigned saturation. Border bytes of Dest are not written by this routine.

Kernel layout: three rows are loaded as 8 bytes each from Kernel, Kernel+8 bytes
and Kernel+16 bytes, i.e. 12 signed shorts laid out as
    |K0 K1 K2 0|
    |K3 K4 K5 0|
    |K6 K7 K8 0|
All four words of each row take part in the multiply and in the horizontal sum,
so the fourth word of each row is multiplied with the pixel to the right of the
3x3 window; it has to be 0 for a pure 3x3 kernel.

NOTE(review): each inner iteration loads 8 bytes (movq) from each of the three
source rows although only the low 4 are unpacked, so loads near the end of the
image extend beyond the bytes actually used - confirm that Src buffers are
padded accordingly.

NOTE(review): the assembly uses 32-bit x86 registers and pusha/popa.

\param Src Pointer to the start of the source byte array (rows x columns bytes).
\param Dest Pointer to the start of the destination byte array.
\param rows Number of rows in the arrays; must be >= 3.
\param columns Number of columns in the arrays; must be >= 3.
\param Kernel Pointer to the kernel words in the 3 x 4 layout described above.
\param Divisor Divisor applied to the summed result; must not be 0.

\return Returns 0 when the MMX path was taken, or -1 for invalid parameters or
        when SDL_imageFilterMMXdetect() reports no MMX (there is no C fallback).
*/
04201 int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04202                                                                                    signed short *Kernel, unsigned char Divisor)
04203 {
04204         /* Validate input parameters */
04205         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04206                 return(-1);
04207 
              /* Need at least one interior pixel; a zero Divisor would fault in IDIV */
04208         if ((columns < 3) || (rows < 3) || (Divisor == 0))
04209                 return (-1);
04210 
              /* Only an MMX implementation exists; without MMX the call fails with -1 */
04211         if ((SDL_imageFilterMMXdetect())) {
04212 #ifdef USE_MMX
04213 #if !defined(GCC__)
                      /* Intel-syntax inline assembly for non-GCC compilers */
04214                 __asm
04215                 {
04216                         pusha
04217                                 pxor mm0, mm0           /* zero MM0 */
04218                                 xor ebx, ebx    /* zero EBX */
04219                                 mov bl, Divisor         /* load Divisor into BL */
04220                                 mov edx, Kernel         /* load Kernel address into EDX */
04221                                 movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
04222                         add edx, 8      /* second row              |K0 K1 K2 0| */
04223                                 movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
04224                         add edx, 8      /* third row               |K6 K7 K8 0| */
04225                                 movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
04226                         /* ---, */
04227                         mov eax, columns        /* load columns into EAX */
04228                                 mov esi, Src    /* ESI = Src row 0 address */
04229                                 mov edi, Dest           /* load Dest address to EDI */
04230                                 add edi, eax    /* EDI = EDI + columns */
04231                                 inc              edi            /* 1 byte offset from the left edge */
04232                                 mov edx, rows           /* initialize ROWS counter */
04233                                 sub edx, 2      /* do not use first and last row */
04234                                 /* ---, */
04235 L10320:
04236                         mov ecx, eax    /* initialize COLUMNS counter */
04237                                 sub ecx, 2      /* do not use first and last column */
04238                                 align 16                        /* 16 byte alignment of the loop entry */
04239 L10322:
04240                         /* ---, */
04241                         movq mm1, [esi]         /* load 8 bytes of the image first row */
04242                         add esi, eax    /* move one row below */
04243                                 movq mm2, [esi]         /* load 8 bytes of the image second row */
04244                         add esi, eax    /* move one row below */
04245                                 movq mm3, [esi]         /* load 8 bytes of the image third row */
04246                         punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
04247                                 punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
04248                                 punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
04249                                 pmullw mm1, mm5         /* multiply words first row  image*Kernel */
04250                                 pmullw mm2, mm6         /* multiply words second row image*Kernel */
04251                                 pmullw mm3, mm7         /* multiply words third row  image*Kernel */
04252                                 paddsw mm1, mm2         /* add 4 words of the first and second rows */
04253                                 paddsw mm1, mm3         /* add 4 words of the third row and result */
04254                                 movq mm2, mm1           /* copy MM1 into MM2 */
04255                                 psrlq mm1, 32           /* shift 2 left words to the right */
04256                                 paddsw mm1, mm2         /* add 2 left and 2 right result words */
04257                                 movq mm3, mm1           /* copy MM1 into MM3 */
04258                                 psrlq mm1, 16           /* shift 1 left word to the right */
04259                                 paddsw mm1, mm3         /* add 1 left and 1 right result words */
04260                                 /* --, */
04261                                 movd mm2, eax           /* save EAX in MM2 */
04262                                 movd mm3, edx           /* save EDX in MM3 */
04263                                 movd eax, mm1           /* copy MM1 into EAX */
04264                                 psraw mm1, 15           /* spread sign bit of the result */
04265                                 movd edx, mm1           /* fill EDX with a sign bit */
04266                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04267                                 movd mm1, eax           /* move result of division into MM1 */
04268                                 packuswb mm1, mm0       /* pack division result with saturation */
04269                                 movd eax, mm1           /* copy saturated result into EAX */
04270                                 mov [edi], al           /* copy a byte result into Dest */
04271                                 movd edx, mm3           /* restore saved EDX */
04272                                 movd eax, mm2           /* restore saved EAX */
04273                                 /* --, */
04274                                 sub esi, eax    /* move two rows up */
04275                                 sub esi, eax    /* */
04276                                 inc              esi            /* move Src  pointer to the next pixel */
04277                                 inc              edi            /* move Dest pointer to the next pixel */
04278                                 /* ---, */
04279                                 dec              ecx            /* decrease loop counter COLUMNS */
04280                                 jnz            L10322           /* check loop termination, proceed if required */
04281                                 add esi, 2      /* move to the next row in Src */
04282                                 add edi, 2      /* move to the next row in Dest */
04283                                 dec              edx            /* decrease loop counter ROWS */
04284                                 jnz            L10320           /* check loop termination, proceed if required */
04285                                 /* ---, */
04286                                 emms                            /* exit MMX state */
04287                                 popa
04288                 }
04289 #else
                      /* GCC extended asm (AT&T syntax); intended to mirror the Intel-syntax block above */
04290                 asm volatile
04291                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04292                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04293                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04294                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04295                         "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
04296                         "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
04297                         "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
04298                         "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
04299                         "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
04300                         /* --- */
04301                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04302                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
04303                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04304                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
04305                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
04306                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
04307                         "sub          $2, %%edx \n\t"   /* do not use first and last row */
04308                         /* --- */
                              /* NOTE(review): .L10320/.L10322 are fixed label names, not numeric local labels */
04309                         ".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04310                         "sub          $2, %%ecx \n\t"   /* do not use first and last column */
04311                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04312                         ".L10322:               \n\t"
04313                         /* --- */
04314                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
04315                         "add       %%eax, %%esi \n\t"   /* move one row below */
04316                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
04317                         "add       %%eax, %%esi \n\t"   /* move one row below */
04318                         "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
04319                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
04320                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
04321                         "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
04322                         "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
04323                         "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
04324                         "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
04325                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
04326                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
04327                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04328                         "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
04329                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
04330                         "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
04331                         "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
04332                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
04333                         /* -- */
                              /* EAX (columns) and EDX (row counter) are parked in MMX registers because IDIV needs DX:AX */
04334                         "movd      %%eax, %%mm2 \n\t"   /* save EAX in MM2 */
04335                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04336                         "movd      %%mm1, %%eax \n\t"   /* copy MM1 into EAX */
04337                         "psraw       $15, %%mm1 \n\t"   /* spread sign bit of the result */
04338                         "movd      %%mm1, %%edx \n\t"   /* fill EDX with a sign bit */
04339                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04340                         "movd      %%eax, %%mm1 \n\t"   /* move result of division into MM1 */
04341                         "packuswb  %%mm0, %%mm1 \n\t"   /* pack division result with saturation */
04342                         "movd      %%mm1, %%eax \n\t"   /* copy saturated result into EAX */
04343                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04344                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04345                         "movd      %%mm2, %%eax \n\t"   /* restore saved EAX */
04346                         /* -- */
04347                         "sub       %%eax, %%esi \n\t"   /* move two rows up */
04348                         "sub       %%eax, %%esi \n\t"   /* */
04349                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04350                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04351                         /* --- */
04352                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04353                         "jnz            .L10322 \n\t"   /* check loop termination, proceed if required */
04354                         "add          $2, %%esi \n\t"   /* move to the next row in Src */
04355                         "add          $2, %%edi \n\t"   /* move to the next row in Dest */
04356                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
04357                         "jnz            .L10320 \n\t"   /* check loop termination, proceed if required */
04358                         /* --- */
04359                         "emms                   \n\t"   /* exit MMX state */
                              /*
                               * NOTE(review): Dest is listed as an "=m" output, but the code only
                               * reads the pointer (%0) and stores through it; no clobber list
                               * (e.g. "memory") is given. Confirm this is safe under optimization.
                               */
04360                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04361                         :"m"(Src),              /* %1 */
04362                         "m"(rows),              /* %2 */
04363                         "m"(columns),           /* %3 */
04364                         "m"(Kernel),            /* %4 */
04365                         "m"(Divisor)            /* %5 */
04366                         );
04367 #endif
04368 #endif
                      /* Reached even when USE_MMX is not defined, in which case nothing was written to Dest */
04369                 return (0);
04370         } else {
04371                 /* No non-MMX implementation yet */
04372                 return (-1);
04373         }
04374 }
04375 
04390 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04391                                                                                    signed short *Kernel, unsigned char Divisor)
04392 {
04393         /* Validate input parameters */
04394         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04395                 return(-1);
04396 
04397         if ((columns < 5) || (rows < 5) || (Divisor == 0))
04398                 return (-1);
04399 
04400         if ((SDL_imageFilterMMXdetect())) {
04401 #ifdef USE_MMX
04402 #if !defined(GCC__)
04403                 __asm
04404                 {
04405                         pusha
04406                                 pxor mm0, mm0           /* zero MM0 */
04407                                 xor ebx, ebx    /* zero EBX */
04408                                 mov bl, Divisor         /* load Divisor into BL */
04409                                 movd mm5, ebx           /* copy Divisor into MM5 */
04410                                 mov edx, Kernel         /* load Kernel address into EDX */
04411                                 mov esi, Src    /* load Src  address to ESI */
04412                                 mov edi, Dest           /* load Dest address to EDI */
04413                                 add edi, 2      /* 2 column offset from the left edge */
04414                                 mov eax, columns        /* load columns into EAX */
04415                                 shl eax, 1      /* EAX = columns * 2 */
04416                                 add edi, eax    /* 2 row offset from the top edge */
04417                                 shr eax, 1      /* EAX = columns */
04418                                 mov ebx, rows           /* initialize ROWS counter */
04419                                 sub ebx, 4      /* do not use first 2 and last 2 rows */
04420                                 /* ---, */
04421 L10330:
04422                         mov ecx, eax    /* initialize COLUMNS counter */
04423                                 sub ecx, 4      /* do not use first 2 and last 2 columns */
04424                                 align 16                        /* 16 byte alignment of the loop entry */
04425 L10332:
04426                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04427                                 movd mm6, esi           /* save ESI in MM6 */
04428                                 /* --- 1 */
04429                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04430                         movq mm2, mm1           /* copy MM1 into MM2 */
04431                                 add esi, eax    /* move Src pointer 1 row below */
04432                                 movq mm3, [edx]         /* load 4 words of Kernel */
04433                         add edx, 8      /* move pointer to other 4 words */
04434                                 movq mm4, [edx]         /* load 4 words of Kernel */
04435                         add edx, 8      /* move pointer to other 4 words */
04436                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04437                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04438                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04439                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04440                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04441                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04442                                 /* --- 2 */
04443                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04444                         movq mm2, mm1           /* copy MM1 into MM2 */
04445                                 add esi, eax    /* move Src pointer 1 row below */
04446                                 movq mm3, [edx]         /* load 4 words of Kernel */
04447                         add edx, 8      /* move pointer to other 4 words */
04448                                 movq mm4, [edx]         /* load 4 words of Kernel */
04449                         add edx, 8      /* move pointer to other 4 words */
04450                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04451                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04452                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04453                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04454                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04455                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04456                                 /* --- 3 */
04457                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04458                         movq mm2, mm1           /* copy MM1 into MM2 */
04459                                 add esi, eax    /* move Src pointer 1 row below */
04460                                 movq mm3, [edx]         /* load 4 words of Kernel */
04461                         add edx, 8      /* move pointer to other 4 words */
04462                                 movq mm4, [edx]         /* load 4 words of Kernel */
04463                         add edx, 8      /* move pointer to other 4 words */
04464                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04465                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04466                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04467                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04468                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04469                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04470                                 /* --- 4 */
04471                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04472                         movq mm2, mm1           /* copy MM1 into MM2 */
04473                                 add esi, eax    /* move Src pointer 1 row below */
04474                                 movq mm3, [edx]         /* load 4 words of Kernel */
04475                         add edx, 8      /* move pointer to other 4 words */
04476                                 movq mm4, [edx]         /* load 4 words of Kernel */
04477                         add edx, 8      /* move pointer to other 4 words */
04478                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04479                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04480                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04481                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04482                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04483                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04484                                 /* --- 5 */
04485                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04486                         movq mm2, mm1           /* copy MM1 into MM2 */
04487                                 movq mm3, [edx]         /* load 4 words of Kernel */
04488                         add edx, 8      /* move pointer to other 4 words */
04489                                 movq mm4, [edx]         /* load 4 words of Kernel */
04490                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04491                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04492                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04493                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04494                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04495                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04496                                 /* ---, */
04497                                 movq mm3, mm7           /* copy MM7 into MM3 */
04498                                 psrlq mm7, 32           /* shift 2 left words to the right */
04499                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04500                                 movq mm2, mm7           /* copy MM7 into MM2 */
04501                                 psrlq mm7, 16           /* shift 1 left word to the right */
04502                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04503                                 /* ---, */
04504                                 movd mm1, eax           /* save EDX in MM1 */
04505                                 movd mm2, ebx           /* save EDX in MM2 */
04506                                 movd mm3, edx           /* save EDX in MM3 */
04507                                 movd eax, mm7           /* load summation result into EAX */
04508                                 psraw mm7, 15           /* spread sign bit of the result */
04509                                 movd ebx, mm5           /* load Divisor into EBX */
04510                                 movd edx, mm7           /* fill EDX with a sign bit */
04511                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04512                                 movd mm7, eax           /* move result of division into MM7 */
04513                                 packuswb mm7, mm0       /* pack division result with saturation */
04514                                 movd eax, mm7           /* copy saturated result into EAX */
04515                                 mov [edi], al           /* copy a byte result into Dest */
04516                                 movd edx, mm3           /* restore saved EDX */
04517                                 movd ebx, mm2           /* restore saved EBX */
04518                                 movd eax, mm1           /* restore saved EAX */
04519                                 /* --, */
04520                                 movd esi, mm6           /* move Src pointer to the top pixel */
04521                                 sub edx, 72     /* EDX = Kernel address */
04522                                 inc              esi            /* move Src  pointer to the next pixel */
04523                                 inc              edi            /* move Dest pointer to the next pixel */
04524                                 /* ---, */
04525                                 dec              ecx            /* decrease loop counter COLUMNS */
04526                                 jnz            L10332           /* check loop termination, proceed if required */
04527                                 add esi, 4      /* move to the next row in Src */
04528                                 add edi, 4      /* move to the next row in Dest */
04529                                 dec              ebx            /* decrease loop counter ROWS */
04530                                 jnz            L10330           /* check loop termination, proceed if required */
04531                                 /* ---, */
04532                                 emms                            /* exit MMX state */
04533                                 popa
04534                 }
04535 #else
04536                 asm volatile
04537                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04538                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04539                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04540                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04541                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04542                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04543                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04544                         "add          $2, %%edi \n\t"   /* 2 column offset from the left edge */
04545                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04546                         "shl          $1, %%eax \n\t"   /* EAX = columns * 2 */
04547                         "add       %%eax, %%edi \n\t"   /* 2 row offset from the top edge */
04548                         "shr          $1, %%eax \n\t"   /* EAX = columns */
04549                         "mov          %2, %%ebx \n\t"   /* initialize ROWS counter */
04550                         "sub          $4, %%ebx \n\t"   /* do not use first 2 and last 2 rows */
04551                         /* --- */
04552                         ".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04553                         "sub          $4, %%ecx \n\t"   /* do not use first 2 and last 2 columns */
04554                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04555                         ".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04556                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04557                         /* --- 1 */
04558                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04559                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04560                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04561                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04562                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04563                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04564                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04565                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04566                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04567                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04568                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04569                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04570                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04571                         /* --- 2 */
04572                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04573                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04574                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04575                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04576                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04577                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04578                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04579                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04580                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04581                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04582                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04583                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04584                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04585                         /* --- 3 */
04586                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04587                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04588                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04589                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04590                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04591                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04592                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04593                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04594                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04595                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04596                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04597                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04598                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04599                         /* --- 4 */
04600                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04601                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04602                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04603                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04604                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04605                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04606                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04607                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04608                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04609                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04610                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04611                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04612                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04613                         /* --- 5 */
04614                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04615                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04616                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04617                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04618                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04619                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04620                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04621                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04622                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04623                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04624                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04625                         /* --- */
04626                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04627                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04628                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04629                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04630                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04631                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04632                         /* --- */
04633                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04634                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04635                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04636                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04637                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04638                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04639                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04640                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04641                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04642                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04643                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
04644                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04645                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04646                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
04647                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
04648                         /* -- */
04649                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
04650                         "sub         $72, %%edx \n\t"   /* EDX = Kernel address */
04651                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04652                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04653                         /* --- */
04654                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04655                         "jnz            .L10332 \n\t"   /* check loop termination, proceed if required */
04656                         "add          $4, %%esi \n\t"   /* move to the next row in Src */
04657                         "add          $4, %%edi \n\t"   /* move to the next row in Dest */
04658                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
04659                         "jnz            .L10330 \n\t"   /* check loop termination, proceed if required */
04660                         /* --- */
04661                         "emms                   \n\t"   /* exit MMX state */
04662                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04663                         :"m"(Src),              /* %1 */
04664                         "m"(rows),              /* %2 */
04665                         "m"(columns),           /* %3 */
04666                         "m"(Kernel),            /* %4 */
04667                         "m"(Divisor)            /* %5 */
04668                         );
04669 #endif
04670 #endif
04671                 return (0);
04672         } else {
04673                 /* No non-MMX implementation yet */
04674                 return (-1);
04675         }
04676 }
04677 
04692 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04693                                                                                    signed short *Kernel, unsigned char Divisor)
04694 {
04695         /* Validate input parameters */
04696         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04697                 return(-1);
04698 
04699         if ((columns < 7) || (rows < 7) || (Divisor == 0))
04700                 return (-1);
04701 
04702         if ((SDL_imageFilterMMXdetect())) {
04703 #ifdef USE_MMX
04704 #if !defined(GCC__)
04705                 __asm
04706                 {
04707                         pusha
04708                                 pxor mm0, mm0           /* zero MM0 */
04709                                 xor ebx, ebx    /* zero EBX */
04710                                 mov bl, Divisor         /* load Divisor into BL */
04711                                 movd mm5, ebx           /* copy Divisor into MM5 */
04712                                 mov edx, Kernel         /* load Kernel address into EDX */
04713                                 mov esi, Src    /* load Src  address to ESI */
04714                                 mov edi, Dest           /* load Dest address to EDI */
04715                                 add edi, 3      /* 3 column offset from the left edge */
04716                                 mov eax, columns        /* load columns into EAX */
04717                                 add edi, eax    /* 3 row offset from the top edge */
04718                                 add edi, eax
04719                                 add edi, eax
04720                                 mov ebx, rows           /* initialize ROWS counter */
04721                                 sub ebx, 6      /* do not use first 3 and last 3 rows */
04722                                 /* ---, */
04723 L10340:
04724                         mov ecx, eax    /* initialize COLUMNS counter */
04725                                 sub ecx, 6      /* do not use first 3 and last 3 columns */
04726                                 align 16                        /* 16 byte alignment of the loop entry */
04727 L10342:
04728                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04729                                 movd mm6, esi           /* save ESI in MM6 */
04730                                 /* --- 1 */
04731                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04732                         movq mm2, mm1           /* copy MM1 into MM2 */
04733                                 add esi, eax    /* move Src pointer 1 row below */
04734                                 movq mm3, [edx]         /* load 4 words of Kernel */
04735                         add edx, 8      /* move pointer to other 4 words */
04736                                 movq mm4, [edx]         /* load 4 words of Kernel */
04737                         add edx, 8      /* move pointer to other 4 words */
04738                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04739                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04740                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04741                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04742                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04743                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04744                                 /* --- 2 */
04745                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04746                         movq mm2, mm1           /* copy MM1 into MM2 */
04747                                 add esi, eax    /* move Src pointer 1 row below */
04748                                 movq mm3, [edx]         /* load 4 words of Kernel */
04749                         add edx, 8      /* move pointer to other 4 words */
04750                                 movq mm4, [edx]         /* load 4 words of Kernel */
04751                         add edx, 8      /* move pointer to other 4 words */
04752                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04753                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04754                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04755                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04756                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04757                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04758                                 /* --- 3 */
04759                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04760                         movq mm2, mm1           /* copy MM1 into MM2 */
04761                                 add esi, eax    /* move Src pointer 1 row below */
04762                                 movq mm3, [edx]         /* load 4 words of Kernel */
04763                         add edx, 8      /* move pointer to other 4 words */
04764                                 movq mm4, [edx]         /* load 4 words of Kernel */
04765                         add edx, 8      /* move pointer to other 4 words */
04766                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04767                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04768                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04769                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04770                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04771                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04772                                 /* --- 4 */
04773                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04774                         movq mm2, mm1           /* copy MM1 into MM2 */
04775                                 add esi, eax    /* move Src pointer 1 row below */
04776                                 movq mm3, [edx]         /* load 4 words of Kernel */
04777                         add edx, 8      /* move pointer to other 4 words */
04778                                 movq mm4, [edx]         /* load 4 words of Kernel */
04779                         add edx, 8      /* move pointer to other 4 words */
04780                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04781                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04782                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04783                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04784                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04785                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04786                                 /* --- 5 */
04787                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04788                         movq mm2, mm1           /* copy MM1 into MM2 */
04789                                 add esi, eax    /* move Src pointer 1 row below */
04790                                 movq mm3, [edx]         /* load 4 words of Kernel */
04791                         add edx, 8      /* move pointer to other 4 words */
04792                                 movq mm4, [edx]         /* load 4 words of Kernel */
04793                         add edx, 8      /* move pointer to other 4 words */
04794                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04795                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04796                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04797                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04798                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04799                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04800                                 /* --- 6 */
04801                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04802                         movq mm2, mm1           /* copy MM1 into MM2 */
04803                                 add esi, eax    /* move Src pointer 1 row below */
04804                                 movq mm3, [edx]         /* load 4 words of Kernel */
04805                         add edx, 8      /* move pointer to other 4 words */
04806                                 movq mm4, [edx]         /* load 4 words of Kernel */
04807                         add edx, 8      /* move pointer to other 4 words */
04808                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04809                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04810                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04811                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04812                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04813                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04814                                 /* --- 7 */
04815                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04816                         movq mm2, mm1           /* copy MM1 into MM2 */
04817                                 movq mm3, [edx]         /* load 4 words of Kernel */
04818                         add edx, 8      /* move pointer to other 4 words */
04819                                 movq mm4, [edx]         /* load 4 words of Kernel */
04820                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04821                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04822                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04823                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04824                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04825                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04826                                 /* ---, */
04827                                 movq mm3, mm7           /* copy MM7 into MM3 */
04828                                 psrlq mm7, 32           /* shift 2 left words to the right */
04829                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04830                                 movq mm2, mm7           /* copy MM7 into MM2 */
04831                                 psrlq mm7, 16           /* shift 1 left word to the right */
04832                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04833                                 /* ---, */
04834                                 movd mm1, eax           /* save EAX in MM1 */
04835                                 movd mm2, ebx           /* save EBX in MM2 */
04836                                 movd mm3, edx           /* save EDX in MM3 */
04837                                 movd eax, mm7           /* load summation result into EAX */
04838                                 psraw mm7, 15           /* spread sign bit of the result */
04839                                 movd ebx, mm5           /* load Divisor into EBX */
04840                                 movd edx, mm7           /* fill EDX with a sign bit */
04841                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04842                                 movd mm7, eax           /* move result of division into MM7 */
04843                                 packuswb mm7, mm0       /* pack division result with saturation */
04844                                 movd eax, mm7           /* copy saturated result into EAX */
04845                                 mov [edi], al           /* copy a byte result into Dest */
04846                                 movd edx, mm3           /* restore saved EDX */
04847                                 movd ebx, mm2           /* restore saved EBX */
04848                                 movd eax, mm1           /* restore saved EAX */
04849                                 /* --, */
04850                                 movd esi, mm6           /* move Src pointer to the top pixel */
04851                                 sub edx, 104    /* EDX = Kernel address */
04852                                 inc              esi            /* move Src  pointer to the next pixel */
04853                                 inc              edi            /* move Dest pointer to the next pixel */
04854                                 /* ---, */
04855                                 dec              ecx            /* decrease loop counter COLUMNS */
04856                                 jnz            L10342           /* check loop termination, proceed if required */
04857                                 add esi, 6      /* move to the next row in Src */
04858                                 add edi, 6      /* move to the next row in Dest */
04859                                 dec              ebx            /* decrease loop counter ROWS */
04860                                 jnz            L10340           /* check loop termination, proceed if required */
04861                                 /* ---, */
04862                                 emms                            /* exit MMX state */
04863                                 popa
04864                 }
04865 #else
04866                 asm volatile
04867                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04868                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04869                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04870                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04871                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04872                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04873                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04874                         "add          $3, %%edi \n\t"   /* 3 column offset from the left edge */
04875                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04876                         "add       %%eax, %%edi \n\t"   /* 3 row offset from the top edge */
04877                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"       /* initialize ROWS counter */
04878                         "sub          $6, %%ebx \n\t"   /* do not use first 3 and last 3 rows */
04879                         /* --- */
04880                         ".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04881                         "sub          $6, %%ecx \n\t"   /* do not use first 3 and last 3 columns */
04882                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04883                         ".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04884                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04885                         /* --- 1 */
04886                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04887                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04888                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04889                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04890                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04891                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04892                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04893                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04894                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04895                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04896                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04897                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04898                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04899                         /* --- 2 */
04900                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04901                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04902                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04903                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04904                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04905                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04906                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04907                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04908                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04909                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04910                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04911                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04912                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04913                         /* --- 3 */
04914                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04915                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04916                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04917                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04918                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04919                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04920                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04921                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04922                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04923                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04924                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04925                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04926                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04927                         /* --- 4 */
04928                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04929                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04930                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04931                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04932                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04933                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04934                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04935                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04936                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04937                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04938                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04939                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04940                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04941                         /* --- 5 */
04942                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04943                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04944                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04945                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04946                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04947                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04948                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04949                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04950                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04951                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04952                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04953                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04954                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04955                         /* --- 6 */
04956                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04957                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04958                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04959                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04960                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04961                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04962                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04963                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04964                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04965                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04966                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04967                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04968                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04969                         /* --- 7 */
04970                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04971                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04972                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04973                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04974                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04975                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04976                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04977                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04978                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04979                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04980                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04981                         /* --- */
04982                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04983                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04984                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04985                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04986                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04987                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04988                         /* --- */
04989                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04990                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04991                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04992                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04993                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04994                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04995                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04996                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04997                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04998                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04999                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
05000                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
05001                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
05002                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
05003                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
05004                         /* -- */
05005                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
05006                         "sub        $104, %%edx \n\t"   /* EDX = Kernel address */
05007                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
05008                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05009                         /* --- */
05010                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05011                         "jnz            .L10342 \n\t"   /* check loop termination, proceed if required */
05012                         "add          $6, %%esi \n\t"   /* move to the next row in Src */
05013                         "add          $6, %%edi \n\t"   /* move to the next row in Dest */
05014                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
05015                         "jnz            .L10340 \n\t"   /* check loop termination, proceed if required */
05016                         /* --- */
05017                         "emms                   \n\t"   /* exit MMX state */
05018                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05019                         :"m"(Src),              /* %1 */
05020                         "m"(rows),              /* %2 */
05021                         "m"(columns),           /* %3 */
05022                         "m"(Kernel),            /* %4 */
05023                         "m"(Divisor)            /* %5 */
05024                         );
05025 #endif
05026 #endif
05027                 return (0);
05028         } else {
05029                 /* No non-MMX implementation yet */
05030                 return (-1);
05031         }
05032 }
05033 
05048 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
05049                                                                                    signed short *Kernel, unsigned char Divisor)
05050 {
05051         /* Validate input parameters */
05052         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
05053                 return(-1);
05054 
05055         if ((columns < 9) || (rows < 9) || (Divisor == 0))
05056                 return (-1);
05057 
05058         if ((SDL_imageFilterMMXdetect())) {
05059 #ifdef USE_MMX
05060 #if !defined(GCC__)
05061                 __asm
05062                 {
05063                         pusha
05064                                 pxor mm0, mm0           /* zero MM0 */
05065                                 xor ebx, ebx    /* zero EBX */
05066                                 mov bl, Divisor         /* load Divisor into BL */
05067                                 movd mm5, ebx           /* copy Divisor into MM5 */
05068                                 mov edx, Kernel         /* load Kernel address into EDX */
05069                                 mov esi, Src    /* load Src  address to ESI */
05070                                 mov edi, Dest           /* load Dest address to EDI */
05071                                 add edi, 4      /* 4 column offset from the left edge */
05072                                 mov eax, columns        /* load columns into EAX */
05073                                 add edi, eax    /* 4 row offset from the top edge */
05074                                 add edi, eax
05075                                 add edi, eax
05076                                 add edi, eax
05077                                 mov ebx, rows           /* initialize ROWS counter */
05078                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
05079                                 /* ---, */
05080 L10350:
05081                         mov ecx, eax    /* initialize COLUMNS counter */
05082                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
05083                                 align 16                        /* 16 byte alignment of the loop entry */
05084 L10352:
05085                         pxor mm7, mm7           /* zero MM7 (accumulator) */
05086                                 movd mm6, esi           /* save ESI in MM6 */
05087                                 /* --- 1 */
05088                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05089                         movq mm2, mm1           /* copy MM1 into MM2 */
05090                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05091                                 movq mm3, [edx]         /* load 4 words of Kernel */
05092                         add edx, 8      /* move pointer to other 4 words */
05093                                 movq mm4, [edx]         /* load 4 words of Kernel */
05094                         add edx, 8      /* move pointer to other 4 words */
05095                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05096                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05097                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05098                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05099                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05100                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05101                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05102                         dec              esi
05103                                 add esi, eax    /* move Src pointer 1 row below */
05104                                 movq mm3, [edx]         /* load 4 words of Kernel */
05105                         add edx, 8      /* move pointer to other 4 words */
05106                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05107                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05108                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05109                                 /* --- 2 */
05110                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05111                         movq mm2, mm1           /* copy MM1 into MM2 */
05112                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05113                                 movq mm3, [edx]         /* load 4 words of Kernel */
05114                         add edx, 8      /* move pointer to other 4 words */
05115                                 movq mm4, [edx]         /* load 4 words of Kernel */
05116                         add edx, 8      /* move pointer to other 4 words */
05117                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05118                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05119                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05120                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05121                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05122                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05123                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05124                         dec              esi
05125                                 add esi, eax    /* move Src pointer 1 row below */
05126                                 movq mm3, [edx]         /* load 4 words of Kernel */
05127                         add edx, 8      /* move pointer to other 4 words */
05128                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05129                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05130                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05131                                 /* --- 3 */
05132                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05133                         movq mm2, mm1           /* copy MM1 into MM2 */
05134                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05135                                 movq mm3, [edx]         /* load 4 words of Kernel */
05136                         add edx, 8      /* move pointer to other 4 words */
05137                                 movq mm4, [edx]         /* load 4 words of Kernel */
05138                         add edx, 8      /* move pointer to other 4 words */
05139                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05140                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05141                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05142                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05143                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05144                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05145                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05146                         dec              esi
05147                                 add esi, eax    /* move Src pointer 1 row below */
05148                                 movq mm3, [edx]         /* load 4 words of Kernel */
05149                         add edx, 8      /* move pointer to other 4 words */
05150                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05151                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05152                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05153                                 /* --- 4 */
05154                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05155                         movq mm2, mm1           /* copy MM1 into MM2 */
05156                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05157                                 movq mm3, [edx]         /* load 4 words of Kernel */
05158                         add edx, 8      /* move pointer to other 4 words */
05159                                 movq mm4, [edx]         /* load 4 words of Kernel */
05160                         add edx, 8      /* move pointer to other 4 words */
05161                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05162                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05163                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05164                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05165                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05166                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05167                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05168                         dec              esi
05169                                 add esi, eax    /* move Src pointer 1 row below */
05170                                 movq mm3, [edx]         /* load 4 words of Kernel */
05171                         add edx, 8      /* move pointer to other 4 words */
05172                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05173                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05174                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05175                                 /* --- 5 */
05176                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05177                         movq mm2, mm1           /* copy MM1 into MM2 */
05178                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05179                                 movq mm3, [edx]         /* load 4 words of Kernel */
05180                         add edx, 8      /* move pointer to other 4 words */
05181                                 movq mm4, [edx]         /* load 4 words of Kernel */
05182                         add edx, 8      /* move pointer to other 4 words */
05183                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05184                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05185                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05186                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05187                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05188                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05189                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05190                         dec              esi
05191                                 add esi, eax    /* move Src pointer 1 row below */
05192                                 movq mm3, [edx]         /* load 4 words of Kernel */
05193                         add edx, 8      /* move pointer to other 4 words */
05194                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05195                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05196                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05197                                 /* --- 6 */
05198                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05199                         movq mm2, mm1           /* copy MM1 into MM2 */
05200                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05201                                 movq mm3, [edx]         /* load 4 words of Kernel */
05202                         add edx, 8      /* move pointer to other 4 words */
05203                                 movq mm4, [edx]         /* load 4 words of Kernel */
05204                         add edx, 8      /* move pointer to other 4 words */
05205                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05206                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05207                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05208                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05209                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05210                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05211                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05212                         dec              esi
05213                                 add esi, eax    /* move Src pointer 1 row below */
05214                                 movq mm3, [edx]         /* load 4 words of Kernel */
05215                         add edx, 8      /* move pointer to other 4 words */
05216                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05217                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05218                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05219                                 /* --- 7 */
05220                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05221                         movq mm2, mm1           /* copy MM1 into MM2 */
05222                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05223                                 movq mm3, [edx]         /* load 4 words of Kernel */
05224                         add edx, 8      /* move pointer to other 4 words */
05225                                 movq mm4, [edx]         /* load 4 words of Kernel */
05226                         add edx, 8      /* move pointer to other 4 words */
05227                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05228                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05229                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05230                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05231                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05232                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05233                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05234                         dec              esi
05235                                 add esi, eax    /* move Src pointer 1 row below */
05236                                 movq mm3, [edx]         /* load 4 words of Kernel */
05237                         add edx, 8      /* move pointer to other 4 words */
05238                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05239                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05240                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05241                                 /* --- 8 */
05242                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05243                         movq mm2, mm1           /* copy MM1 into MM2 */
05244                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05245                                 movq mm3, [edx]         /* load 4 words of Kernel */
05246                         add edx, 8      /* move pointer to other 4 words */
05247                                 movq mm4, [edx]         /* load 4 words of Kernel */
05248                         add edx, 8      /* move pointer to other 4 words */
05249                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05250                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05251                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05252                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05253                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05254                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05255                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05256                         dec              esi
05257                                 add esi, eax    /* move Src pointer 1 row below */
05258                                 movq mm3, [edx]         /* load 4 words of Kernel */
05259                         add edx, 8      /* move pointer to other 4 words */
05260                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05261                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05262                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05263                                 /* --- 9 */
05264                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05265                         movq mm2, mm1           /* copy MM1 into MM2 */
05266                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05267                                 movq mm3, [edx]         /* load 4 words of Kernel */
05268                         add edx, 8      /* move pointer to other 4 words */
05269                                 movq mm4, [edx]         /* load 4 words of Kernel */
05270                         add edx, 8      /* move pointer to other 4 words */
05271                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05272                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05273                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05274                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05275                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05276                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05277                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05278                         movq mm3, [edx]         /* load 4 words of Kernel */
05279                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05280                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05281                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05282                                 /* ---, */
05283                                 movq mm3, mm7           /* copy MM7 into MM3 */
05284                                 psrlq mm7, 32           /* shift 2 left words to the right */
05285                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
05286                                 movq mm2, mm7           /* copy MM7 into MM2 */
05287                                 psrlq mm7, 16           /* shift 1 left word to the right */
05288                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
05289                                 /* ---, */
05290                                 movd mm1, eax           /* save EAX in MM1 */
05291                                 movd mm2, ebx           /* save EBX in MM2 */
05292                                 movd mm3, edx           /* save EDX in MM3 */
05293                                 movd eax, mm7           /* load summation result into EAX */
05294                                 psraw mm7, 15           /* spread sign bit of the result */
05295                                 movd ebx, mm5           /* load Divisor into EBX */
05296                                 movd edx, mm7           /* fill EDX with a sign bit */
05297                                 idiv bx         /* IDIV - VERY EXPENSIVE */
05298                                 movd mm7, eax           /* move result of division into MM7 */
05299                                 packuswb mm7, mm0       /* pack division result with saturation */
05300                                 movd eax, mm7           /* copy saturated result into EAX */
05301                                 mov [edi], al           /* copy a byte result into Dest */
05302                                 movd edx, mm3           /* restore saved EDX */
05303                                 movd ebx, mm2           /* restore saved EBX */
05304                                 movd eax, mm1           /* restore saved EAX */
05305                                 /* --, */
05306                                 movd esi, mm6           /* move Src pointer to the top pixel */
05307                                 sub edx, 208    /* EDX = Kernel address */
05308                                 inc              esi            /* move Src  pointer to the next pixel */
05309                                 inc              edi            /* move Dest pointer to the next pixel */
05310                                 /* ---, */
05311                                 dec              ecx            /* decrease loop counter COLUMNS */
05312                                 jnz            L10352           /* check loop termination, proceed if required */
05313                                 add esi, 8      /* move to the next row in Src */
05314                                 add edi, 8      /* move to the next row in Dest */
05315                                 dec              ebx            /* decrease loop counter ROWS */
05316                                 jnz            L10350           /* check loop termination, proceed if required */
05317                                 /* ---, */
05318                                 emms                            /* exit MMX state */
05319                                 popa
05320                 }
05321 #else
05322                 asm volatile
05323                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05324                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05325                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
05326                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
05327                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05328                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
05329                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05330                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
05331                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05332                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
05333                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
05334                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
05335                         /* --- */
05336                         ".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
05337                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
05338                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05339                         ".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
05340                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
05341                         /* --- 1 */
05342                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05343                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05344                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05345                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05346                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05347                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05348                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05349                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05350                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05351                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05352                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05353                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05354                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05355                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05356                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05357                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05358                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05359                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05360                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05361                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05362                         /* --- 2 */
05363                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05364                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05365                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05366                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05367                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05368                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05369                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05370                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05371                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05372                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05373                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05374                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05375                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05376                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05377                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05378                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05379                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05380                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05381                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05382                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05383                         /* --- 3 */
05384                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05385                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05386                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05387                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05388                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05389                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05390                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05391                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05392                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05393                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05394                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05395                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05396                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05397                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05398                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05399                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05400                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05401                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05402                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05403                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05404                         /* --- 4 */
05405                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05406                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05407                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05408                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05409                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05410                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05411                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05412                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05413                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05414                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05415                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05416                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05417                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05418                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05419                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05420                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05421                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05422                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05423                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05424                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05425                         /* --- 5 */
05426                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05427                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05428                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05429                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05430                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05431                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05432                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05433                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05434                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05435                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05436                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05437                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05438                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05439                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05440                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05441                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05442                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05443                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05444                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05445                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05446                         /* --- 6 */
05447                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05448                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05449                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05450                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05451                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05452                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05453                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05454                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05455                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05456                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05457                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05458                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05459                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05460                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05461                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05462                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05463                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05464                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05465                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05466                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05467                         /* --- 7 */
05468                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05469                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05470                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05471                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05472                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05473                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05474                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05475                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05476                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05477                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05478                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05479                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05480                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05481                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05482                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05483                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05484                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05485                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05486                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05487                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05488                         /* --- 8 */
05489                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05490                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05491                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05492                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05493                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05494                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05495                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05496                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05497                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05498                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05499                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05500                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05501                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05502                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05503                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05504                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05505                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05506                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05507                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05508                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05509                         /* --- 9 */
05510                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05511                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05512                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05513                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05514                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05515                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05516                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05517                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05518                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05519                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05520                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05521                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05522                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05523                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05524                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05525                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05526                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05527                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05528                         /* --- */
05529                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
05530                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
05531                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
05532                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
05533                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
05534                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
05535                         /* --- */
05536                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
05537                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
05538                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
05539                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
05540                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
05541                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
05542                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
05543                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
05544                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
05545                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
05546                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
05547                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
05548                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
05549                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
05550                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
05551                         /* -- */
05552                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
05553                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
05554                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
05555                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05556                         /* --- */
05557                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05558                         "jnz            .L10352 \n\t"   /* check loop termination, proceed if required */
05559                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
05560                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
05561                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
05562                         "jnz            .L10350 \n\t"   /* check loop termination, proceed if required */
05563                         /* --- */
05564                         "emms                   \n\t"   /* exit MMX state */
05565                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05566                         :"m"(Src),              /* %1 */
05567                         "m"(rows),              /* %2 */
05568                         "m"(columns),           /* %3 */
05569                         "m"(Kernel),            /* %4 */
05570                         "m"(Divisor)            /* %5 */
05571                         );
05572 #endif
05573 #endif
05574                 return (0);
05575         } else {
05576                 /* No non-MMX implementation yet */
05577                 return (-1);
05578         }
05579 }
05580 
05595 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
05596                                                                                            signed short *Kernel, unsigned char NRightShift)
05597 {
05598         /* Validate input parameters */
05599         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
05600                 return(-1);
05601 
05602         if ((columns < 3) || (rows < 3) || (NRightShift > 7))
05603                 return (-1);
05604 
05605         if ((SDL_imageFilterMMXdetect())) {
05606 #ifdef USE_MMX
05607 #if !defined(GCC__)
05608                 __asm
05609                 {
05610                         pusha
05611                                 pxor mm0, mm0           /* zero MM0 */
05612                                 xor ebx, ebx    /* zero EBX */
05613                                 mov bl, NRightShift     /* load NRightShift into BL */
05614                                 movd mm4, ebx           /* copy NRightShift into MM4 */
05615                                 mov edx, Kernel         /* load Kernel address into EDX */
05616                                 movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
05617                         add edx, 8      /* second row              |K0 K1 K2 0| */
05618                                 movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05619                         add edx, 8      /* third row               |K6 K7 K8 0| */
05620                                 movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
05621                         /* ---, */
05622                         mov eax, columns        /* load columns into EAX */
05623                                 mov esi, Src    /* ESI = Src row 0 address */
05624                                 mov edi, Dest           /* load Dest address to EDI */
05625                                 add edi, eax    /* EDI = EDI + columns */
05626                                 inc              edi            /* 1 byte offset from the left edge */
05627                                 mov edx, rows           /* initialize ROWS counter */
05628                                 sub edx, 2      /* do not use first and last row */
05629                                 /* ---, */
05630 L10360:
05631                         mov ecx, eax    /* initialize COLUMNS counter */
05632                                 sub ecx, 2      /* do not use first and last column */
05633                                 align 16                        /* 16 byte alignment of the loop entry */
05634 L10362:
05635                         /* ---, */
05636                         movq mm1, [esi]         /* load 8 bytes of the image first row */
05637                         add esi, eax    /* move one row below */
05638                                 movq mm2, [esi]         /* load 8 bytes of the image second row */
05639                         add esi, eax    /* move one row below */
05640                                 movq mm3, [esi]         /* load 8 bytes of the image third row */
05641                         punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
05642                                 punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
05643                                 punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
05644                                 psrlw mm1, mm4          /* shift right each pixel NshiftRight times */
05645                                 psrlw mm2, mm4          /* shift right each pixel NshiftRight times */
05646                                 psrlw mm3, mm4          /* shift right each pixel NshiftRight times */
05647                                 pmullw mm1, mm5         /* multiply words first row  image*Kernel */
05648                                 pmullw mm2, mm6         /* multiply words second row image*Kernel */
05649                                 pmullw mm3, mm7         /* multiply words third row  image*Kernel */
05650                                 paddsw mm1, mm2         /* add 4 words of the first and second rows */
05651                                 paddsw mm1, mm3         /* add 4 words of the third row and result */
05652                                 movq mm2, mm1           /* copy MM1 into MM2 */
05653                                 psrlq mm1, 32           /* shift 2 left words to the right */
05654                                 paddsw mm1, mm2         /* add 2 left and 2 right result words */
05655                                 movq mm3, mm1           /* copy MM1 into MM3 */
05656                                 psrlq mm1, 16           /* shift 1 left word to the right */
05657                                 paddsw mm1, mm3         /* add 1 left and 1 right result words */
05658                                 packuswb mm1, mm0       /* pack shift result with saturation */
05659                                 movd ebx, mm1           /* copy saturated result into EBX */
05660                                 mov [edi], bl           /* copy a byte result into Dest */
05661                                 /* --, */
05662                                 sub esi, eax    /* move two rows up */
05663                                 sub esi, eax
05664                                 inc              esi            /* move Src  pointer to the next pixel */
05665                                 inc              edi            /* move Dest pointer to the next pixel */
05666                                 /* ---, */
05667                                 dec              ecx            /* decrease loop counter COLUMNS */
05668                                 jnz            L10362           /* check loop termination, proceed if required */
05669                                 add esi, 2      /* move to the next row in Src */
05670                                 add edi, 2      /* move to the next row in Dest */
05671                                 dec              edx            /* decrease loop counter ROWS */
05672                                 jnz            L10360           /* check loop termination, proceed if required */
05673                                 /* ---, */
05674                                 emms                            /* exit MMX state */
05675                                 popa
05676                 }
05677 #else
05678                 asm volatile
05679                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05680                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05681                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
05682                         "movd      %%ebx, %%mm4 \n\t"   /* copy NRightShift into MM4 */
05683                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05684                         "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
05685                         "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
05686                         "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05687                         "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
05688                         "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
05689                         /* --- */
05690                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05691                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
05692                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05693                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
05694                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
05695                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
05696                         "sub          $2, %%edx \n\t"   /* do not use first and last row */
05697                         /* --- */
05698                         ".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMS counter */
05699                         "sub          $2, %%ecx \n\t"   /* do not use first and last column */
05700                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05701                         ".L10362:               \n\t"
05702                         /* --- */
05703                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
05704                         "add       %%eax, %%esi \n\t"   /* move one row below */
05705                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
05706                         "add       %%eax, %%esi \n\t"   /* move one row below */
05707                         "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
05708                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
05709                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
05710                         "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
05711                         "psrlw     %%mm4, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
05712                         "psrlw     %%mm4, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
05713                         "psrlw     %%mm4, %%mm3 \n\t"   /* shift right each pixel NshiftRight times */
05714                         "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
05715                         "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
05716                         "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
05717                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
05718                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
05719                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05720                         "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
05721                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
05722                         "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
05723                         "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
05724                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
05725                         "packuswb  %%mm0, %%mm1 \n\t"   /* pack shift result with saturation */
05726                         "movd      %%mm1, %%ebx \n\t"   /* copy saturated result into EBX */
05727                         "mov      %%bl, (%%edi) \n\t"   /* copy a byte result into Dest */
05728                         /* -- */
05729                         "sub       %%eax, %%esi \n\t"   /* move two rows up */
05730                         "sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"     /* move Src  pointer to the next pixel */
05731                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05732                         /* --- */
05733                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05734                         "jnz            .L10362 \n\t"   /* check loop termination, proceed if required */
05735                         "add          $2, %%esi \n\t"   /* move to the next row in Src */
05736                         "add          $2, %%edi \n\t"   /* move to the next row in Dest */
05737                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
05738                         "jnz            .L10360 \n\t"   /* check loop termination, proceed if required */
05739                         /* --- */
05740                         "emms                   \n\t"   /* exit MMX state */
05741                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05742                         :"m"(Src),              /* %1 */
05743                         "m"(rows),              /* %2 */
05744                         "m"(columns),           /* %3 */
05745                         "m"(Kernel),            /* %4 */
05746                         "m"(NRightShift)        /* %5 */
05747                         );
05748 #endif
05749 #endif
05750                 return (0);
05751         } else {
05752                 /* No non-MMX implementation yet */
05753                 return (-1);
05754         }
05755 }
05756 
/*
 * Scalar helpers for the portable path of SDL_imageFilterConvolveKernel5x5ShiftRight.
 * They reproduce the per-lane arithmetic of the MMX instructions used below so that
 * both paths yield the same bytes.
 */

/* Low 16 bits of pixel*weight, reinterpreted as a signed word (what pmullw keeps). */
static long convolveKernel5x5MulLow16(unsigned int pixel, signed short weight)
{
	long product = (long) pixel * (long) weight;
	long low = (long) ((unsigned long) product & 0xFFFFUL);

	return (low >= 0x8000L) ? (low - 0x10000L) : low;
}

/* Clamp to the signed 16-bit range (the saturation applied by paddsw). */
static long convolveKernel5x5Sat16(long value)
{
	if (value > 32767L)
		return 32767L;
	if (value < -32768L)
		return -32768L;
	return value;
}

/*
 * Portable C version of the 5x5 shift-right convolution.
 *
 * For every kernel row, eight consecutive source bytes are combined with the
 * eight kernel words of that row into four 16-bit lanes (tap j and tap j+4
 * share lane j), exactly as the MMX code does. Source bytes that would lie
 * beyond the end of the image buffer are not read and contribute 0.
 */
static void convolveKernel5x5ShiftRightC(const unsigned char *Src, unsigned char *Dest, int rows, int columns,
					 const signed short *Kernel, unsigned char NRightShift)
{
	const size_t stride = (size_t) columns;
	const size_t total = (size_t) rows * stride;
	int row, col, krow, lane;

	for (row = 0; row < rows - 4; row++) {
		for (col = 0; col < columns - 4; col++) {
			long acc[4] = { 0L, 0L, 0L, 0L };
			long sum;

			for (krow = 0; krow < 5; krow++) {
				const size_t base = (size_t) (row + krow) * stride + (size_t) col;
				const signed short *weights = Kernel + krow * 8;

				for (lane = 0; lane < 4; lane++) {
					long low = convolveKernel5x5MulLow16((unsigned int) (Src[base + lane] >> NRightShift),
									     weights[lane]);
					long high = 0L;

					/* Taps 5..7 can fall past the buffer end on the last image row. */
					if (base + (size_t) lane + 4 < total) {
						high = convolveKernel5x5MulLow16((unsigned int) (Src[base + lane + 4] >> NRightShift),
										 weights[lane + 4]);
					}
					acc[lane] = convolveKernel5x5Sat16(acc[lane] + convolveKernel5x5Sat16(low + high));
				}
			}

			/* Horizontal add in the same order (and with the same saturation) as the MMX code. */
			sum = convolveKernel5x5Sat16(convolveKernel5x5Sat16(acc[0] + acc[2]) +
						     convolveKernel5x5Sat16(acc[1] + acc[3]));

			/* packuswb: signed word -> unsigned byte with saturation. */
			if (sum < 0L)
				sum = 0L;
			else if (sum > 255L)
				sum = 255L;

			Dest[(size_t) (row + 2) * stride + (size_t) col + 2] = (unsigned char) sum;
		}
	}
}

/*!
\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( sum( (Src >> NRightShift) * Kernel ) )

Every source byte is shifted right by NRightShift before it is multiplied with
its kernel weight. Products and partial sums are 16-bit (products wrap, sums
saturate); the final sum is clamped to 0..255. Only the interior of Dest is
written; the two outermost rows and columns on each side are left untouched.

\param Src The source 2D byte array (rows*columns bytes). The MMX paths load 8 bytes per
           tap row and can therefore read up to 3 bytes past the end of the array.
\param Dest The destination 2D byte array (rows*columns bytes).
\param rows Number of rows in source/destination array. Must be >= 5.
\param columns Number of columns in source/destination array. Must be >= 5.
\param Kernel The convolution kernel stored as 5 rows of 8 signed words (40 values).
              Words 0..4 of each row are the 5x5 weights; words 5..7 are applied to the
              three source bytes to the right of the window and should normally be 0.
\param NRightShift The number of right bit shifts applied to each source byte. Must be <= 7.

\return Returns 0 for success or -1 for invalid parameters.
*/
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
					       signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
		return (-1);

#ifdef USE_MMX
	if ((SDL_imageFilterMMXdetect())) {
		/*
		 * NOTE(review): both asm variants are 32-bit x86 only (pusha/popa). In the GCC
		 * variant the "m" operands are read after pusha has moved ESP; this presumably
		 * relies on the operands being addressed through the frame pointer - confirm
		 * the build does not use -fomit-frame-pointer for this file.
		 */
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0		/* zero MM0 */
			xor ebx, ebx		/* zero EBX */
			mov bl, NRightShift	/* load NRightShift into BL */
			movd mm5, ebx		/* copy NRightShift into MM5 */
			mov edx, Kernel		/* load Kernel address into EDX */
			mov esi, Src		/* load Src  address to ESI */
			mov edi, Dest		/* load Dest address to EDI */
			add edi, 2		/* 2 column offset from the left edge */
			mov eax, columns	/* load columns into EAX */
			shl eax, 1		/* EAX = columns * 2 */
			add edi, eax		/* 2 row offset from the top edge */
			shr eax, 1		/* EAX = columns */
			mov ebx, rows		/* initialize ROWS counter */
			sub ebx, 4		/* do not use first 2 and last 2 rows */
			/* --- */
L10370:
			mov ecx, eax		/* initialize COLUMNS counter */
			sub ecx, 4		/* do not use first 2 and last 2 columns */
			align 16		/* 16 byte alignment of the loop entry */
L10372:
			pxor mm7, mm7		/* zero MM7 (accumulator) */
			movd mm6, esi		/* save ESI in MM6 */
			/* --- kernel row 1 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NRightShift times */
			psrlw mm2, mm5		/* shift right each pixel NRightShift times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- kernel row 2 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NRightShift times */
			psrlw mm2, mm5		/* shift right each pixel NRightShift times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- kernel row 3 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NRightShift times */
			psrlw mm2, mm5		/* shift right each pixel NRightShift times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- kernel row 4 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NRightShift times */
			psrlw mm2, mm5		/* shift right each pixel NRightShift times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- kernel row 5 (ESI/EDX are not advanced past the last load) */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NRightShift times */
			psrlw mm2, mm5		/* shift right each pixel NRightShift times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- horizontal add of the 4 accumulator words */
			movq mm3, mm7		/* copy MM7 into MM3 */
			psrlq mm7, 32		/* shift 2 left words to the right */
			paddsw mm7, mm3		/* add 2 left and 2 right result words */
			movq mm2, mm7		/* copy MM7 into MM2 */
			psrlq mm7, 16		/* shift 1 left word to the right */
			paddsw mm7, mm2		/* add 1 left and 1 right result words */
			movd mm1, eax		/* save EAX in MM1 */
			packuswb mm7, mm0	/* pack result with unsigned saturation */
			movd eax, mm7		/* copy saturated result into EAX */
			mov [edi], al		/* copy a byte result into Dest */
			movd eax, mm1		/* restore saved EAX */
			/* --- */
			movd esi, mm6		/* move Src pointer to the top pixel */
			sub edx, 72		/* EDX = Kernel address (9 advances of 8 bytes) */
			inc esi			/* move Src  pointer to the next pixel */
			inc edi			/* move Dest pointer to the next pixel */
			/* --- */
			dec ecx			/* decrease loop counter COLUMNS */
			jnz L10372		/* check loop termination, proceed if required */
			add esi, 4		/* move to the next row in Src */
			add edi, 4		/* move to the next row in Dest */
			dec ebx			/* decrease loop counter ROWS */
			jnz L10370		/* check loop termination, proceed if required */
			/* --- */
			emms			/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha              \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
			"shr          $1, %%eax \n\t"	/* EAX = columns */
			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
			/* --- */
			".L10370:               \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10372:               \n\t"
			"pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- kernel row 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- kernel row 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- kernel row 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- kernel row 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- kernel row 5 (ESI/EDX are not advanced past the last load) */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- horizontal add of the 4 accumulator words */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack result with unsigned saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* --- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub         $72, %%edx \n\t"	/* EDX = Kernel address (9 advances of 8 bytes) */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
			"add          $4, %%esi \n\t"	/* move to the next row in Src */
			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:			/* no outputs: the Dest pointer itself is only read */
			:"m"(Dest),		/* %0 */
			"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			:"memory", "cc"		/* stores through EDI and changes EFLAGS */
			);
#endif
		return (0);
	}
#endif

	/* MMX not compiled in or not available at runtime: use the portable implementation */
	convolveKernel5x5ShiftRightC(Src, Dest, rows, columns, Kernel, NRightShift);
	return (0);
}
06056 
06071 int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
06072                                                                                            signed short *Kernel, unsigned char NRightShift)
06073 {
06074         /* Validate input parameters */
06075         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
06076                 return(-1);
06077 
06078         if ((columns < 7) || (rows < 7) || (NRightShift > 7))
06079                 return (-1);
06080 
06081         if ((SDL_imageFilterMMXdetect())) {
06082 #ifdef USE_MMX
06083 #if !defined(GCC__)
06084                 __asm
06085                 {
06086                         pusha
06087                                 pxor mm0, mm0           /* zero MM0 */
06088                                 xor ebx, ebx    /* zero EBX */
06089                                 mov bl, NRightShift     /* load NRightShift into BL */
06090                                 movd mm5, ebx           /* copy NRightShift into MM5 */
06091                                 mov edx, Kernel         /* load Kernel address into EDX */
06092                                 mov esi, Src    /* load Src  address to ESI */
06093                                 mov edi, Dest           /* load Dest address to EDI */
06094                                 add edi, 3      /* 3 column offset from the left edge */
06095                                 mov eax, columns        /* load columns into EAX */
06096                                 add edi, eax    /* 3 row offset from the top edge */
06097                                 add edi, eax
06098                                 add edi, eax
06099                                 mov ebx, rows           /* initialize ROWS counter */
06100                                 sub ebx, 6      /* do not use first 3 and last 3 rows */
06101                                 /* ---, */
06102 L10380:
06103                         mov ecx, eax    /* initialize COLUMNS counter */
06104                                 sub ecx, 6      /* do not use first 3 and last 3 columns */
06105                                 align 16                        /* 16 byte alignment of the loop entry */
06106 L10382:
06107                         pxor mm7, mm7           /* zero MM7 (accumulator) */
06108                                 movd mm6, esi           /* save ESI in MM6 */
06109                                 /* --- 1 */
06110                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06111                         movq mm2, mm1           /* copy MM1 into MM2 */
06112                                 add esi, eax    /* move Src pointer 1 row below */
06113                                 movq mm3, [edx]         /* load 4 words of Kernel */
06114                         add edx, 8      /* move pointer to other 4 words */
06115                                 movq mm4, [edx]         /* load 4 words of Kernel */
06116                         add edx, 8      /* move pointer to other 4 words */
06117                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06118                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06119                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06120                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06121                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06122                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06123                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06124                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06125                                 /* --- 2 */
06126                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06127                         movq mm2, mm1           /* copy MM1 into MM2 */
06128                                 add esi, eax    /* move Src pointer 1 row below */
06129                                 movq mm3, [edx]         /* load 4 words of Kernel */
06130                         add edx, 8      /* move pointer to other 4 words */
06131                                 movq mm4, [edx]         /* load 4 words of Kernel */
06132                         add edx, 8      /* move pointer to other 4 words */
06133                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06134                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06135                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06136                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06137                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06138                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06139                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06140                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06141                                 /* --- 3 */
06142                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06143                         movq mm2, mm1           /* copy MM1 into MM2 */
06144                                 add esi, eax    /* move Src pointer 1 row below */
06145                                 movq mm3, [edx]         /* load 4 words of Kernel */
06146                         add edx, 8      /* move pointer to other 4 words */
06147                                 movq mm4, [edx]         /* load 4 words of Kernel */
06148                         add edx, 8      /* move pointer to other 4 words */
06149                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06150                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06151                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06152                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06153                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06154                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06155                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06156                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06157                                 /* --- 4 */
06158                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06159                         movq mm2, mm1           /* copy MM1 into MM2 */
06160                                 add esi, eax    /* move Src pointer 1 row below */
06161                                 movq mm3, [edx]         /* load 4 words of Kernel */
06162                         add edx, 8      /* move pointer to other 4 words */
06163                                 movq mm4, [edx]         /* load 4 words of Kernel */
06164                         add edx, 8      /* move pointer to other 4 words */
06165                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06166                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06167                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06168                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06169                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06170                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06171                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06172                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06173                                 /* --- 5 */
06174                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06175                         movq mm2, mm1           /* copy MM1 into MM2 */
06176                                 add esi, eax    /* move Src pointer 1 row below */
06177                                 movq mm3, [edx]         /* load 4 words of Kernel */
06178                         add edx, 8      /* move pointer to other 4 words */
06179                                 movq mm4, [edx]         /* load 4 words of Kernel */
06180                         add edx, 8      /* move pointer to other 4 words */
06181                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06182                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06183                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06184                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06185                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06186                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06187                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06188                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06189                                 /* --- 6 */
06190                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06191                         movq mm2, mm1           /* copy MM1 into MM2 */
06192                                 add esi, eax    /* move Src pointer 1 row below */
06193                                 movq mm3, [edx]         /* load 4 words of Kernel */
06194                         add edx, 8      /* move pointer to other 4 words */
06195                                 movq mm4, [edx]         /* load 4 words of Kernel */
06196                         add edx, 8      /* move pointer to other 4 words */
06197                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06198                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06199                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06200                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06201                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06202                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06203                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06204                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06205                                 /* --- 7 */
06206                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06207                         movq mm2, mm1           /* copy MM1 into MM2 */
06208                                 movq mm3, [edx]         /* load 4 words of Kernel */
06209                         add edx, 8      /* move pointer to other 4 words */
06210                                 movq mm4, [edx]         /* load 4 words of Kernel */
06211                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06212                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06213                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06214                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06215                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06216                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06217                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06218                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06219                                 /* ---, */
06220                                 movq mm3, mm7           /* copy MM7 into MM3 */
06221                                 psrlq mm7, 32           /* shift 2 left words to the right */
06222                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
06223                                 movq mm2, mm7           /* copy MM7 into MM2 */
06224                                 psrlq mm7, 16           /* shift 1 left word to the right */
06225                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
06226                                 movd mm1, eax           /* save EAX in MM1 */
06227                                 packuswb mm7, mm0       /* pack division result with saturation */
06228                                 movd eax, mm7           /* copy saturated result into EAX */
06229                                 mov [edi], al           /* copy a byte result into Dest */
06230                                 movd eax, mm1           /* restore saved EAX */
06231                                 /* --, */
06232                                 movd esi, mm6           /* move Src pointer to the top pixel */
06233                                 sub edx, 104    /* EDX = Kernel address */
06234                                 inc              esi            /* move Src  pointer to the next pixel */
06235                                 inc              edi            /* move Dest pointer to the next pixel */
06236                                 /* ---, */
06237                                 dec              ecx            /* decrease loop counter COLUMNS */
06238                                 jnz            L10382           /* check loop termination, proceed if required */
06239                                 add esi, 6      /* move to the next row in Src */
06240                                 add edi, 6      /* move to the next row in Dest */
06241                                 dec              ebx            /* decrease loop counter ROWS */
06242                                 jnz            L10380           /* check loop termination, proceed if required */
06243                                 /* ---, */
06244                                 emms                            /* exit MMX state */
06245                                 popa
06246                 }
06247 #else
06248                 asm volatile
06249                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06250                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
06251                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
06252                         "movd      %%ebx, %%mm5 \n\t"   /* copy NRightShift into MM5 */
06253                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
06254                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
06255                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06256                         "add          $3, %%edi \n\t"   /* 3 column offset from the left edge */
06257                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06258                         "add       %%eax, %%edi \n\t"   /* 3 row offset from the top edge */
06259                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"       /* initialize ROWS counter */
06260                         "sub          $6, %%ebx \n\t"   /* do not use first 3 and last 3 rows */
06261                         /* --- */
06262                         ".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
06263                         "sub          $6, %%ecx \n\t"   /* do not use first 3 and last 3 columns */
06264                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06265                         ".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
06266                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
06267                         /* --- 1 */
06268                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06269                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06270                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06271                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06272                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06273                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06274                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06275                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06276                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06277                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06278                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06279                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06280                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06281                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06282                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06283                         /* --- 2 */
06284                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06285                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06286                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06287                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06288                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06289                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06290                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06291                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06292                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06293                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06294                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06295                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06296                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06297                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06298                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06299                         /* --- 3 */
06300                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06301                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06302                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06303                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06304                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06305                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06306                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06307                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06308                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06309                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06310                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06311                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06312                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06313                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06314                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06315                         /* --- 4 */
06316                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06317                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06318                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06319                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06320                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06321                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06322                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06323                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06324                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06325                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06326                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06327                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06328                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06329                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06330                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06331                         /* --- 5 */
06332                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06333                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06334                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06335                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06336                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06337                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06338                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06339                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06340                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06341                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06342                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06343                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06344                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06345                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06346                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06347                         /* --- 6 */
06348                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06349                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06350                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06351                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06352                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06353                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06354                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06355                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06356                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06357                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06358                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06359                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06360                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06361                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06362                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06363                         /* --- 7 */
06364                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06365                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06366                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06367                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06368                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06369                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06370                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06371                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06372                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06373                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06374                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06375                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06376                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06377                         /* --- */
06378                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06379                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06380                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
06381                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
06382                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
06383                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
06384                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
06385                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
06386                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
06387                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
06388                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
06389                         /* -- */
06390                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
06391                         "sub        $104, %%edx \n\t"   /* EDX = Kernel address */
06392                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
06393                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
06394                         /* --- */
06395                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
06396                         "jnz            .L10382 \n\t"   /* check loop termination, proceed if required */
06397                         "add          $6, %%esi \n\t"   /* move to the next row in Src */
06398                         "add          $6, %%edi \n\t"   /* move to the next row in Dest */
06399                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
06400                         "jnz            .L10380 \n\t"   /* check loop termination, proceed if required */
06401                         /* --- */
06402                         "emms                   \n\t"   /* exit MMX state */
06403                         "popa                   \n\t":"=m" (Dest)       /* %0 */
06404                         :"m"(Src),              /* %1 */
06405                         "m"(rows),              /* %2 */
06406                         "m"(columns),           /* %3 */
06407                         "m"(Kernel),            /* %4 */
06408                         "m"(NRightShift)        /* %5 */
06409                         );
06410 #endif
06411 #endif
06412                 return (0);
06413         } else {
06414                 /* No non-MMX implementation yet */
06415                 return (-1);
06416         }
06417 }
06418 
06433 int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
06434                                                                                            signed short *Kernel, unsigned char NRightShift)
06435 {
06436         /* Validate input parameters */
06437         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
06438                 return(-1);
06439 
06440         if ((columns < 9) || (rows < 9) || (NRightShift > 7))
06441                 return (-1);
06442 
06443         if ((SDL_imageFilterMMXdetect())) {
06444 #ifdef USE_MMX
06445 #if !defined(GCC__)
06446                 __asm
06447                 {
06448                         pusha
06449                                 pxor mm0, mm0           /* zero MM0 */
06450                                 xor ebx, ebx    /* zero EBX */
06451                                 mov bl, NRightShift     /* load NRightShift into BL */
06452                                 movd mm5, ebx           /* copy NRightShift into MM5 */
06453                                 mov edx, Kernel         /* load Kernel address into EDX */
06454                                 mov esi, Src    /* load Src  address to ESI */
06455                                 mov edi, Dest           /* load Dest address to EDI */
06456                                 add edi, 4      /* 4 column offset from the left edge */
06457                                 mov eax, columns        /* load columns into EAX */
06458                                 add edi, eax    /* 4 row offset from the top edge */
06459                                 add edi, eax
06460                                 add edi, eax
06461                                 add edi, eax
06462                                 mov ebx, rows           /* initialize ROWS counter */
06463                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
06464                                 /* ---, */
06465 L10390:
06466                         mov ecx, eax    /* initialize COLUMNS counter */
06467                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
06468                                 align 16                        /* 16 byte alignment of the loop entry */
06469 L10392:
06470                         pxor mm7, mm7           /* zero MM7 (accumulator) */
06471                                 movd mm6, esi           /* save ESI in MM6 */
06472                                 /* --- 1 */
06473                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06474                         movq mm2, mm1           /* copy MM1 into MM2 */
06475                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06476                                 movq mm3, [edx]         /* load 4 words of Kernel */
06477                         add edx, 8      /* move pointer to other 4 words */
06478                                 movq mm4, [edx]         /* load 4 words of Kernel */
06479                         add edx, 8      /* move pointer to other 4 words */
06480                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06481                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06482                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06483                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06484                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06485                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06486                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06487                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06488                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06489                         dec              esi
06490                                 add esi, eax    /* move Src pointer 1 row below */
06491                                 movq mm3, [edx]         /* load 4 words of Kernel */
06492                         add edx, 8      /* move pointer to other 4 words */
06493                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06494                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06495                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06496                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06497                                 /* --- 2 */
06498                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06499                         movq mm2, mm1           /* copy MM1 into MM2 */
06500                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06501                                 movq mm3, [edx]         /* load 4 words of Kernel */
06502                         add edx, 8      /* move pointer to other 4 words */
06503                                 movq mm4, [edx]         /* load 4 words of Kernel */
06504                         add edx, 8      /* move pointer to other 4 words */
06505                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06506                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06507                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06508                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06509                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06510                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06511                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06512                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06513                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06514                         dec              esi
06515                                 add esi, eax    /* move Src pointer 1 row below */
06516                                 movq mm3, [edx]         /* load 4 words of Kernel */
06517                         add edx, 8      /* move pointer to other 4 words */
06518                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06519                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06520                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06521                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06522                                 /* --- 3 */
06523                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06524                         movq mm2, mm1           /* copy MM1 into MM2 */
06525                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06526                                 movq mm3, [edx]         /* load 4 words of Kernel */
06527                         add edx, 8      /* move pointer to other 4 words */
06528                                 movq mm4, [edx]         /* load 4 words of Kernel */
06529                         add edx, 8      /* move pointer to other 4 words */
06530                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06531                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06532                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06533                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06534                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06535                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06536                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06537                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06538                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06539                         dec              esi
06540                                 add esi, eax    /* move Src pointer 1 row below */
06541                                 movq mm3, [edx]         /* load 4 words of Kernel */
06542                         add edx, 8      /* move pointer to other 4 words */
06543                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06544                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06545                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06546                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06547                                 /* --- 4 */
06548                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06549                         movq mm2, mm1           /* copy MM1 into MM2 */
06550                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06551                                 movq mm3, [edx]         /* load 4 words of Kernel */
06552                         add edx, 8      /* move pointer to other 4 words */
06553                                 movq mm4, [edx]         /* load 4 words of Kernel */
06554                         add edx, 8      /* move pointer to other 4 words */
06555                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06556                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06557                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06558                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06559                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06560                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06561                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06562                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06563                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06564                         dec              esi
06565                                 add esi, eax    /* move Src pointer 1 row below */
06566                                 movq mm3, [edx]         /* load 4 words of Kernel */
06567                         add edx, 8      /* move pointer to other 4 words */
06568                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06569                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06570                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06571                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06572                                 /* --- 5 */
06573                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06574                         movq mm2, mm1           /* copy MM1 into MM2 */
06575                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06576                                 movq mm3, [edx]         /* load 4 words of Kernel */
06577                         add edx, 8      /* move pointer to other 4 words */
06578                                 movq mm4, [edx]         /* load 4 words of Kernel */
06579                         add edx, 8      /* move pointer to other 4 words */
06580                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06581                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06582                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06583                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06584                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06585                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06586                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06587                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06588                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06589                         dec              esi
06590                                 add esi, eax    /* move Src pointer 1 row below */
06591                                 movq mm3, [edx]         /* load 4 words of Kernel */
06592                         add edx, 8      /* move pointer to other 4 words */
06593                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06594                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06595                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06596                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06597                                 /* --- 6 */
06598                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06599                         movq mm2, mm1           /* copy MM1 into MM2 */
06600                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06601                                 movq mm3, [edx]         /* load 4 words of Kernel */
06602                         add edx, 8      /* move pointer to other 4 words */
06603                                 movq mm4, [edx]         /* load 4 words of Kernel */
06604                         add edx, 8      /* move pointer to other 4 words */
06605                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06606                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06607                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06608                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06609                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06610                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06611                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06612                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06613                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06614                         dec              esi
06615                                 add esi, eax    /* move Src pointer 1 row below */
06616                                 movq mm3, [edx]         /* load 4 words of Kernel */
06617                         add edx, 8      /* move pointer to other 4 words */
06618                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06619                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06620                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06621                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06622                                 /* --- 7 */
06623                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06624                         movq mm2, mm1           /* copy MM1 into MM2 */
06625                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06626                                 movq mm3, [edx]         /* load 4 words of Kernel */
06627                         add edx, 8      /* move pointer to other 4 words */
06628                                 movq mm4, [edx]         /* load 4 words of Kernel */
06629                         add edx, 8      /* move pointer to other 4 words */
06630                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06631                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06632                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06633                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06634                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06635                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06636                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06637                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06638                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06639                         dec              esi
06640                                 add esi, eax    /* move Src pointer 1 row below */
06641                                 movq mm3, [edx]         /* load 4 words of Kernel */
06642                         add edx, 8      /* move pointer to other 4 words */
06643                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06644                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06645                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06646                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06647                                 /* --- 8 */
06648                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06649                         movq mm2, mm1           /* copy MM1 into MM2 */
06650                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06651                                 movq mm3, [edx]         /* load 4 words of Kernel */
06652                         add edx, 8      /* move pointer to other 4 words */
06653                                 movq mm4, [edx]         /* load 4 words of Kernel */
06654                         add edx, 8      /* move pointer to other 4 words */
06655                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06656                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06657                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06658                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06659                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06660                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06661                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06662                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06663                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06664                         dec              esi
06665                                 add esi, eax    /* move Src pointer 1 row below */
06666                                 movq mm3, [edx]         /* load 4 words of Kernel */
06667                         add edx, 8      /* move pointer to other 4 words */
06668                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06669                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06670                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06671                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06672                                 /* --- 9 */
06673                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06674                         movq mm2, mm1           /* copy MM1 into MM2 */
06675                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06676                                 movq mm3, [edx]         /* load 4 words of Kernel */
06677                         add edx, 8      /* move pointer to other 4 words */
06678                                 movq mm4, [edx]         /* load 4 words of Kernel */
06679                         add edx, 8      /* move pointer to other 4 words */
06680                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06681                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06682                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06683                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06684                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06685                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06686                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06687                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06688                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06689                         movq mm3, [edx]         /* load 4 words of Kernel */
06690                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06691                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06692                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06693                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06694                                 /* ---, */
06695                                 movq mm3, mm7           /* copy MM7 into MM3 */
06696                                 psrlq mm7, 32           /* shift 2 left words to the right */
06697                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
06698                                 movq mm2, mm7           /* copy MM7 into MM2 */
06699                                 psrlq mm7, 16           /* shift 1 left word to the right */
06700                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
06701                                 movd mm1, eax           /* save EAX in MM1 */
06702                                 packuswb mm7, mm0       /* pack division result with saturation */
06703                                 movd eax, mm7           /* copy saturated result into EAX */
06704                                 mov [edi], al           /* copy a byte result into Dest */
06705                                 movd eax, mm1           /* restore saved EAX */
06706                                 /* --, */
06707                                 movd esi, mm6           /* move Src pointer to the top pixel */
06708                                 sub edx, 208    /* EDX = Kernel address */
06709                                 inc              esi            /* move Src  pointer to the next pixel */
06710                                 inc              edi            /* move Dest pointer to the next pixel */
06711                                 /* ---, */
06712                                 dec              ecx            /* decrease loop counter COLUMNS */
06713                                 jnz            L10392           /* check loop termination, proceed if required */
06714                                 add esi, 8      /* move to the next row in Src */
06715                                 add edi, 8      /* move to the next row in Dest */
06716                                 dec              ebx            /* decrease loop counter ROWS */
06717                                 jnz            L10390           /* check loop termination, proceed if required */
06718                                 /* ---, */
06719                                 emms                            /* exit MMX state */
06720                                 popa
06721                 }
06722 #else
06723                 asm volatile
06724                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06725                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
06726                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
06727                         "movd      %%ebx, %%mm5 \n\t"   /* copy NRightShift into MM5 */
06728                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
06729                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
06730                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06731                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
06732                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06733                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
06734                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
06735                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
06736                         /* --- */
06737                         ".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
06738                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
06739                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06740                         ".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
06741                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
06742                         /* --- 1 */
06743                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06744                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06745                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06746                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06747                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06748                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06749                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06750                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06751                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06752                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06753                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06754                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06755                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06756                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06757                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06758                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06759                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06760                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06761                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06762                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06763                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06764                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06765                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06766                         /* --- 2 */
06767                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06768                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06769                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06770                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06771                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06772                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06773                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06774                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06775                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06776                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06777                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06778                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06779                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06780                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06781                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06782                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06783                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06784                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06785                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06786                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06787                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06788                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06789                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06790                         /* --- 3 */
06791                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06792                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06793                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06794                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06795                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06796                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06797                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06798                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06799                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06800                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06801                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06802                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06803                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06804                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06805                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06806                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06807                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06808                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06809                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06810                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06811                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06812                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06813                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06814                         /* --- 4 */
06815                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06816                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06817                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06818                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06819                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06820                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06821                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06822                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06823                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06824                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06825                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06826                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06827                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06828                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06829                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06830                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06831                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06832                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06833                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06834                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06835                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06836                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06837                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06838                         /* --- 5 */
06839                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06840                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06841                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06842                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06843                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06844                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06845                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06846                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06847                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06848                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06849                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06850                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06851                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06852                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06853                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06854                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06855                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06856                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06857                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06858                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06859                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06860                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06861                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06862                         /* --- 6 */
06863                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06864                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06865                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06866                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06867                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06868                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06869                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06870                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06871                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06872                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06873                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06874                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06875                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06876                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06877                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06878                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06879                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06880                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06881                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06882                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06883                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06884                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06885                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06886                         /* --- 7 */
06887                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06888                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06889                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06890                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06891                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06892                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06893                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06894                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06895                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06896                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06897                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06898                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06899                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06900                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06901                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06902                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06903                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06904                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06905                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06906                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06907                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06908                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06909                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06910                         /* --- 8 */
06911                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06912                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06913                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06914                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06915                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06916                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06917                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06918                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06919                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06920                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06921                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06922                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06923                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06924                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06925                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06926                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06927                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06928                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06929                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06930                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06931                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06932                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06933                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06934                         /* --- 9 */
06935                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06936                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06937                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06938                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06939                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06940                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06941                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06942                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06943                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06944                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06945                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06946                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06947                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06948                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06949                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06950                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06951                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06952                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06953                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06954                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06955                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06956                         /* --- */
06957                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06958                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06959                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
06960                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
06961                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
06962                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
06963                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
06964                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
06965                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
06966                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
06967                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
06968                         /* -- */
06969                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
06970                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
06971                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
06972                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
06973                         /* --- */
06974                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
06975                         "jnz            .L10392 \n\t"   /* check loop termination, proceed if required */
06976                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
06977                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
06978                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
06979                         "jnz            .L10390 \n\t"   /* check loop termination, proceed if required */
06980                         /* --- */
06981                         "emms                   \n\t"   /* exit MMX state */
06982                         "popa                   \n\t":"=m" (Dest)       /* %0 */
06983                         :"m"(Src),              /* %1 */
06984                         "m"(rows),              /* %2 */
06985                         "m"(columns),           /* %3 */
06986                         "m"(Kernel),            /* %4 */
06987                         "m"(NRightShift)        /* %5 */
06988                         );
06989 #endif
06990 #endif
06991                 return (0);
06992         } else {
06993                 /* No non-MMX implementation yet */
06994                 return (-1);
06995         }
06996 }
06997 
06998 /* ------------------------------------------------------------------------------------ */
06999 
07012 int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
07013 {
07014         /* Validate input parameters */
07015         if ((Src == NULL) || (Dest == NULL))
07016                 return(-1);
07017 
07018         if ((columns < 8) || (rows < 3))
07019                 return (-1);
07020 
07021         if ((SDL_imageFilterMMXdetect())) {
07022 #ifdef USE_MMX
07023 #if !defined(GCC__)
07024                 __asm
07025                 {
07026                         pusha
07027                                 pxor mm0, mm0           /* zero MM0 */
07028                                 mov eax, columns        /* load columns into EAX */
07029                                 /* ---, */
07030                                 mov esi, Src    /* ESI = Src row 0 address */
07031                                 mov edi, Dest           /* load Dest address to EDI */
07032                                 add edi, eax    /* EDI = EDI + columns */
07033                                 inc              edi            /* 1 byte offset from the left edge */
07034                                 mov edx, rows           /* initialize ROWS counter */
07035                                 sub edx, 2      /* do not use first and last rows */
07036                                 /* ---, */
07037 L10400:
07038                         mov ecx, eax    /* initialize COLUMS counter */
07039                                 shr ecx, 3      /* EBX/8 (MMX loads 8 bytes at a time) */
07040                                 mov ebx, esi    /* save ESI in EBX */
07041                                 movd mm1, edi           /* save EDI in MM1 */
07042                                 align 16                        /* 16 byte alignment of the loop entry */
07043 L10402:
07044                         /* ---, */
07045                         movq mm4, [esi]         /* load 8 bytes from Src */
07046                         movq mm5, mm4           /* save MM4 in MM5 */
07047                                 add esi, 2      /* move ESI pointer 2 bytes right */
07048                                 punpcklbw mm4, mm0      /* unpack 4 low  bytes into words */
07049                                 punpckhbw mm5, mm0      /* unpack 4 high bytes into words */
07050                                 movq mm6, [esi]         /* load 8 bytes from Src */
07051                         movq mm7, mm6           /* save MM6 in MM7 */
07052                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
07053                                 punpcklbw mm6, mm0      /* unpack 4 low  bytes into words */
07054                                 punpckhbw mm7, mm0      /* unpack 4 high bytes into words */
07055                                 add esi, eax    /* move to the next row of Src */
07056                                 movq mm2, [esi]         /* load 8 bytes from Src */
07057                         movq mm3, mm2           /* save MM2 in MM3 */
07058                                 add esi, 2      /* move ESI pointer 2 bytes right */
07059                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
07060                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
07061                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
07062                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
07063                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
07064                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
07065                                 movq mm2, [esi]         /* load 8 bytes from Src */
07066                         movq mm3, mm2           /* save MM2 in MM3 */
07067                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
07068                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
07069                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
07070                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
07071                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
07072                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
07073                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
07074                                 add esi, eax    /* move to the next row of Src */
07075                                 movq mm2, [esi]         /* load 8 bytes from Src */
07076                         movq mm3, mm2           /* save MM2 in MM3 */
07077                                 add esi, 2      /* move ESI pointer 2 bytes right */
07078                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
07079                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
07080                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
07081                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
07082                                 movq mm2, [esi]         /* load 8 bytes from Src */
07083                         movq mm3, mm2           /* save MM2 in MM3 */
07084                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
07085                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
07086                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
07087                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
07088                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
07089                                 /* ---, */
07090                                 movq mm2, mm4           /* copy MM4 into MM2 */
07091                                 psrlq mm4, 32           /* shift 2 left words to the right */
07092                                 psubw mm4, mm2          /* MM4 = MM4 - MM2 */
07093                                 movq mm3, mm6           /* copy MM6 into MM3 */
07094                                 psrlq mm6, 32           /* shift 2 left words to the right */
07095                                 psubw mm6, mm3          /* MM6 = MM6 - MM3 */
07096                                 punpckldq mm4, mm6      /* combine 2 words of MM6 and 2 words of MM4 */
07097                                 movq mm2, mm5           /* copy MM6 into MM2 */
07098                                 psrlq mm5, 32           /* shift 2 left words to the right */
07099                                 psubw mm5, mm2          /* MM5 = MM5 - MM2 */
07100                                 movq mm3, mm7           /* copy MM7 into MM3 */
07101                                 psrlq mm7, 32           /* shift 2 left words to the right */
07102                                 psubw mm7, mm3          /* MM7 = MM7 - MM3 */
07103                                 punpckldq mm5, mm7      /* combine 2 words of MM7 and 2 words of MM5 */
07104                                 /* Take abs values of MM4 and MM5 */
07105                                 movq mm6, mm4           /* copy MM4 into MM6 */
07106                                 movq mm7, mm5           /* copy MM5 into MM7 */
07107                                 psraw mm6, 15           /* fill MM6 words with word sign bit */
07108                                 psraw mm7, 15           /* fill MM7 words with word sign bit */
07109                                 pxor mm4, mm6           /* take 1's compliment of only neg words */
07110                                 pxor mm5, mm7           /* take 1's compliment of only neg words */
07111                                 psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
07112                                 psubsw mm5, mm7         /* add 1 to only neg words, W-(-1) or W-0 */
07113                                 packuswb mm4, mm5       /* combine and pack/saturate MM5 and MM4 */
07114                                 movq [edi], mm4         /* store result in Dest */
07115                                 /* ---, */
07116                                 sub esi, eax    /* move to the current top row in Src */
07117                                 sub esi, eax
07118                                 add esi, 8      /* move Src  pointer to the next 8 pixels */
07119                                 add edi, 8      /* move Dest pointer to the next 8 pixels */
07120                                 /* ---, */
07121                                 dec              ecx            /* decrease loop counter COLUMNS */
07122                                 jnz            L10402           /* check loop termination, proceed if required */
07123                                 mov esi, ebx    /* restore most left current row Src  address */
07124                                 movd edi, mm1           /* restore most left current row Dest address */
07125                                 add esi, eax    /* move to the next row in Src */
07126                                 add edi, eax    /* move to the next row in Dest */
07127                                 dec              edx            /* decrease loop counter ROWS */
07128                                 jnz            L10400           /* check loop termination, proceed if required */
07129                                 /* ---, */
07130                                 emms                            /* exit MMX state */
07131                                 popa
07132                 }
07133 #else
07134                 asm volatile
07135                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
07136                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
07137                         /* --- */
07138                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
07139                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
07140                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
07141                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
07142                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
07143                         "sub          $2, %%edx \n\t"   /* do not use first and last rows */
07144                         /* --- */
07145                         ".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"    /* initialize COLUMS counter */
07146                         "shr          $3, %%ecx \n\t"   /* EBX/8 (MMX loads 8 bytes at a time) */
07147                         "mov       %%esi, %%ebx \n\t"   /* save ESI in EBX */
07148                         "movd      %%edi, %%mm1 \n\t"   /* save EDI in MM1 */
07149                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
07150                         ".L10402:               \n\t"
07151                         /* --- */
07152                         "movq    (%%esi), %%mm4 \n\t"   /* load 8 bytes from Src */
07153                         "movq      %%mm4, %%mm5 \n\t"   /* save MM4 in MM5 */
07154                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
07155                         "punpcklbw %%mm0, %%mm4 \n\t"   /* unpack 4 low  bytes into words */
07156                         "punpckhbw %%mm0, %%mm5 \n\t"   /* unpack 4 high bytes into words */
07157                         "movq    (%%esi), %%mm6 \n\t"   /* load 8 bytes from Src */
07158                         "movq      %%mm6, %%mm7 \n\t"   /* save MM6 in MM7 */
07159                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
07160                         "punpcklbw %%mm0, %%mm6 \n\t"   /* unpack 4 low  bytes into words */
07161                         "punpckhbw %%mm0, %%mm7 \n\t"   /* unpack 4 high bytes into words */
07162                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
07163                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
07164                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
07165                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
07166                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
07167                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
07168                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
07169                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
07170                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
07171                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
07172                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
07173                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
07174                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
07175                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
07176                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
07177                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
07178                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
07179                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
07180                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
07181                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
07182                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
07183                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
07184                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
07185                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
07186                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
07187                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
07188                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
07189                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
07190                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
07191                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
07192                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
07193                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
07194                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
07195                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
07196                         /* --- */
07197                         "movq      %%mm4, %%mm2 \n\t"   /* copy MM4 into MM2 */
07198                         "psrlq       $32, %%mm4 \n\t"   /* shift 2 left words to the right */
07199                         "psubw     %%mm2, %%mm4 \n\t"   /* MM4 = MM4 - MM2 */
07200                         "movq      %%mm6, %%mm3 \n\t"   /* copy MM6 into MM3 */
07201                         "psrlq       $32, %%mm6 \n\t"   /* shift 2 left words to the right */
07202                         "psubw     %%mm3, %%mm6 \n\t"   /* MM6 = MM6 - MM3 */
07203                         "punpckldq %%mm6, %%mm4 \n\t"   /* combine 2 words of MM6 and 2 words of MM4 */
07204                         "movq      %%mm5, %%mm2 \n\t"   /* copy MM6 into MM2 */
07205                         "psrlq       $32, %%mm5 \n\t"   /* shift 2 left words to the right */
07206                         "psubw     %%mm2, %%mm5 \n\t"   /* MM5 = MM5 - MM2 */
07207                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
07208                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
07209                         "psubw     %%mm3, %%mm7 \n\t"   /* MM7 = MM7 - MM3 */
07210                         "punpckldq %%mm7, %%mm5 \n\t"   /* combine 2 words of MM7 and 2 words of MM5 */
07211                         /* Take abs values of MM4 and MM5 */
07212                         "movq      %%mm4, %%mm6 \n\t"   /* copy MM4 into MM6 */
07213                         "movq      %%mm5, %%mm7 \n\t"   /* copy MM5 into MM7 */
07214                         "psraw       $15, %%mm6 \n\t"   /* fill MM6 words with word sign bit */
07215                         "psraw       $15, %%mm7 \n\t"   /* fill MM7 words with word sign bit */
07216                         "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
07217                         "pxor      %%mm7, %%mm5 \n\t"   /* take 1's compliment of only neg. words */
07218                         "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07219                         "psubsw    %%mm7, %%mm5 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07220                         "packuswb  %%mm5, %%mm4 \n\t"   /* combine and pack/saturate MM5 and MM4 */
07221                         "movq    %%mm4, (%%edi) \n\t"   /* store result in Dest */
07222                         /* --- */
07223                         "sub       %%eax, %%esi \n\t"   /* move to the current top row in Src */
07224                         "sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"     /* move Src  pointer to the next 8 pixels */
07225                         "add $8,          %%edi \n\t"   /* move Dest pointer to the next 8 pixels */
07226                         /* --- */
07227                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
07228                         "jnz            .L10402 \n\t"   /* check loop termination, proceed if required */
07229                         "mov       %%ebx, %%esi \n\t"   /* restore most left current row Src  address */
07230                         "movd      %%mm1, %%edi \n\t"   /* restore most left current row Dest address */
07231                         "add       %%eax, %%esi \n\t"   /* move to the next row in Src */
07232                         "add       %%eax, %%edi \n\t"   /* move to the next row in Dest */
07233                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
07234                         "jnz            .L10400 \n\t"   /* check loop termination, proceed if required */
07235                         /* --- */
07236                         "emms                   \n\t"   /* exit MMX state */
07237                         "popa                   \n\t":"=m" (Dest)       /* %0 */
07238                         :"m"(Src),              /* %1 */
07239                         "m"(rows),              /* %2 */
07240                         "m"(columns)            /* %3 */
07241                         );
07242 #endif
07243 #endif
07244                 return (0);
07245         } else {
07246                 /* No non-MMX implementation yet */
07247                 return (-1);
07248         }
07249 }
07250 
/*!
\brief Portable C version of the Sobel-X filter with per-pixel right shift.

For every interior pixel (row 1..rows-2, column 1..columns-2) this computes

  A(c)       = (top[c] >> N) + 2 * (mid[c] >> N) + (bottom[c] >> N)
  Dest[r][c] = min(255, | A(c+1) - A(c-1) |)

which is the same per-pixel arithmetic the MMX kernel in
SDL_imageFilterSobelXShiftRight() performs. Border pixels of Dest are not
written and no byte outside the rows*columns image is read.

\param Src The source 2D byte array (rows * columns bytes, row-major).
\param Dest The destination 2D byte array (rows * columns bytes, row-major).
\param rows Number of rows; the caller guarantees rows >= 3.
\param columns Number of columns; the caller guarantees columns >= 8.
\param NRightShift Right shift applied to every source byte; caller guarantees 0..7.
*/
static void sobelXShiftRightFallback(const unsigned char *Src, unsigned char *Dest, int rows, int columns,
									 unsigned char NRightShift)
{
	const size_t stride = (size_t) columns;
	int row, col;

	for (row = 1; row < rows - 1; row++) {
		const unsigned char *top = Src + (size_t) (row - 1) * stride;
		const unsigned char *mid = top + stride;
		const unsigned char *bottom = mid + stride;
		unsigned char *out = Dest + (size_t) row * stride;

		for (col = 1; col < columns - 1; col++) {
			/* 1-2-1 weighted vertical sums of the shifted left and right neighbours */
			int left = (top[col - 1] >> NRightShift) + 2 * (mid[col - 1] >> NRightShift) + (bottom[col - 1] >> NRightShift);
			int right = (top[col + 1] >> NRightShift) + 2 * (mid[col + 1] >> NRightShift) + (bottom[col + 1] >> NRightShift);
			int diff = right - left;

			if (diff < 0)
				diff = -diff;
			/* Same unsigned saturation as PACKUSWB in the MMX path */
			out[col] = (unsigned char) ((diff > 255) ? 255 : diff);
		}
	}
}

/*!
\brief Filter using SobelXShiftRight: Dij = saturation255( |horizontal Sobel of (Sij >> NRightShift)| )

Each source byte is shifted right by NRightShift, the three rows around a
pixel are summed with weights 1-2-1, and the absolute difference between the
sums two columns apart is stored, saturated to 255, at the pixel in between.

MMX path (when compiled with USE_MMX and SDL_imageFilterMMXdetect() is true):
the kernel processes columns/8 groups of 8 pixels per row, starting its
stores at Dest + columns + 1, and iterates over rows-2 rows.

Portable path (otherwise): the same arithmetic is applied in C to every
interior pixel. Previously this case returned -1 without producing output.
The portable path writes only interior pixels, so it can differ from the
MMX path at the row edges.

NOTE(review): in the MMX path the second load of each group starts 2 bytes
to the right and spans 8 bytes, so when columns is a multiple of 8 the last
group of each row reads 2 bytes past the row (and, for the bottom row, past
rows*columns bytes of Src) -- confirm callers provide padding.

NOTE(review): the GCC asm statement declares Dest as a write-only "=m"
output although it is only read, decrements rows although it is listed as an
input, uses "m" operands after PUSHA has moved ESP, and has no "memory"
clobber -- verify against the compiler flags in use before relying on it.

\param Src The source 2D byte array to filter. Must not be NULL.
\param Dest The destination 2D byte array to store the result in. Must not be NULL.
\param rows Number of rows in source/destination array. Must be >= 3.
\param columns Number of columns in source/destination array. Must be >= 8.
\param NRightShift Number of right bit shifts applied to every source byte. Must be <= 7.

\return Returns 0 if the filter was applied, -1 if a parameter is invalid.
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
									unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#ifdef USE_MMX
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0		/* zero MM0 */
			mov eax, columns	/* load columns into EAX */
			xor ebx, ebx		/* zero EBX */
			mov bl, NRightShift	/* load NRightShift into BL */
			movd mm1, ebx		/* copy NRightShift into MM1 */
			/* --- */
			mov esi, Src		/* ESI = Src row 0 address */
			mov edi, Dest		/* load Dest address to EDI */
			add edi, eax		/* EDI = EDI + columns */
			inc edi			/* 1 byte offset from the left edge */
			/* initialize ROWS counter */
			sub rows, 2		/* do not use first and last rows */
			/* --- */
L10410:
			mov ecx, eax		/* initialize COLUMNS counter */
			shr ecx, 3		/* ECX/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi		/* save ESI in EBX */
			mov edx, edi		/* save EDI in EDX */
			align 16		/* 16 byte alignment of the loop entry */
L10412:
			/* --- top row --- */
			movq mm4, [esi]		/* load 8 bytes from Src */
			movq mm5, mm4		/* save MM4 in MM5 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm5, mm0	/* unpack 4 high bytes into words */
			psrlw mm4, mm1		/* shift right each pixel NRightShift times */
			psrlw mm5, mm1		/* shift right each pixel NRightShift times */
			movq mm6, [esi]		/* load 8 bytes from Src */
			movq mm7, mm6		/* save MM6 in MM7 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm7, mm0	/* unpack 4 high bytes into words */
			psrlw mm6, mm1		/* shift right each pixel NRightShift times */
			psrlw mm7, mm1		/* shift right each pixel NRightShift times */
			/* --- middle row, added twice (weight 2) --- */
			add esi, eax		/* move to the next row of Src */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 */
			/* --- bottom row --- */
			add esi, eax		/* move to the next row of Src */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 */
			/* --- horizontal differences (column c+2 minus column c) --- */
			movq mm2, mm4		/* copy MM4 into MM2 */
			psrlq mm4, 32		/* move the 2 upper words into the 2 lower words */
			psubw mm4, mm2		/* MM4 = MM4 - MM2 */
			movq mm3, mm6		/* copy MM6 into MM3 */
			psrlq mm6, 32		/* move the 2 upper words into the 2 lower words */
			psubw mm6, mm3		/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5		/* copy MM5 into MM2 */
			psrlq mm5, 32		/* move the 2 upper words into the 2 lower words */
			psubw mm5, mm2		/* MM5 = MM5 - MM2 */
			movq mm3, mm7		/* copy MM7 into MM3 */
			psrlq mm7, 32		/* move the 2 upper words into the 2 lower words */
			psubw mm7, mm3		/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4		/* copy MM4 into MM6 */
			movq mm7, mm5		/* copy MM5 into MM7 */
			psraw mm6, 15		/* fill MM6 words with word sign bit */
			psraw mm7, 15		/* fill MM7 words with word sign bit */
			pxor mm4, mm6		/* take 1's complement of only neg words */
			pxor mm5, mm7		/* take 1's complement of only neg words */
			psubsw mm4, mm6		/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm5, mm7		/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm4, mm5	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4		/* store result in Dest */
			/* --- */
			sub esi, eax		/* move to the current top row in Src */
			sub esi, eax
			add esi, 8		/* move Src  pointer to the next 8 pixels */
			add edi, 8		/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx			/* decrease loop counter COLUMNS */
			jnz L10412		/* check loop termination, proceed if required */
			mov esi, ebx		/* restore most left current row Src  address */
			mov edi, edx		/* restore most left current row Dest address */
			add esi, eax		/* move to the next row in Src */
			add edi, eax		/* move to the next row in Dest */
			dec rows		/* decrease loop counter ROWS */
			jnz L10410		/* check loop termination, proceed if required */
			/* --- */
			emms			/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha              \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
			/* --- */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			/* initialize ROWS counter */
			"subl            $2, %2 \n\t"	/* do not use first and last rows */
			/* --- */
			".L10410:                \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"shr          $3, %%ecx \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10412:               \n\t"
			/* --- top row --- */
			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NRightShift times */
			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NRightShift times */
			/* --- middle row, added twice (weight 2) --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			/* --- bottom row --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			/* --- horizontal differences (column c+2 minus column c) --- */
			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			"psrlq       $32, %%mm4 \n\t"	/* move the 2 upper words into the 2 lower words */
			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			"psrlq       $32, %%mm6 \n\t"	/* move the 2 upper words into the 2 lower words */
			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			"psrlq       $32, %%mm5 \n\t"	/* move the 2 upper words into the 2 lower words */
			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* move the 2 upper words into the 2 lower words */
			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
			/* --- */
			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			"sub       %%eax, %%esi \n\t"
			"add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			"decl                %2 \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(NRightShift)	/* %4 */
			);
#endif
		/* MMX kernel has produced the result */
		return (0);
#endif
	}

	/*
	 * Reached when MMX is not detected/disabled, or when MMX was detected
	 * but this build has no USE_MMX code: compute the result in plain C
	 * instead of failing.
	 */
	sobelXShiftRightFallback(Src, Dest, rows, columns, NRightShift);
	return (0);
}
07533 
/*!
\brief Align the stack pointer to a 32 byte boundary (x86, MMX builds only).

The inline assembly copies ESP to EBX, subtracts 4 to make room for the old
ESP value, rounds EBX down to a multiple of 32, stores the old ESP at that
address and finally makes it the new ESP. SDL_imageFilterRestoreStack()
reads the value stored here back into ESP.

When USE_MMX is not defined the function body is empty.

NOTE(review): EBX and ESP are modified but the GCC asm statement has empty
operand lists and no clobber list, so the compiler is not told about it.
NOTE(review): whether the new ESP value is still in effect after this
function returns depends on the prologue/epilogue the compiler generates --
verify for the toolchain in use.
*/
07537 void SDL_imageFilterAlignStack(void)
07538 {
07539 #ifdef USE_MMX
07540 #if !defined(GCC__)
07541         __asm
07542         {                               /* --- stack alignment (MSVC syntax) --- */
07543                 mov ebx, esp    /* load ESP into EBX */
07544                         sub ebx, 4      /* reserve 4 bytes for the old value of ESP */
07545                         and ebx, -32    /* round EBX down to a 32 byte boundary */
07546                         mov [ebx], esp          /* save old value of ESP at the aligned address */
07547                         mov esp, ebx    /* ESP now points at the aligned slot holding the old ESP */
07548         }
07549 #else
07550         asm volatile
07551                 (                               /* --- stack alignment (GCC/AT&T syntax) --- */
07552                 "mov       %%esp, %%ebx \n\t"   /* load ESP into EBX */
07553                 "sub          $4, %%ebx \n\t"   /* reserve 4 bytes for the old value of ESP */
07554                 "and        $-32, %%ebx \n\t"   /* round EBX down to a 32 byte boundary */
07555                 "mov     %%esp, (%%ebx) \n\t"   /* save old value of ESP at the aligned address */
07556                 "mov       %%ebx, %%esp \n\t"   /* ESP now points at the aligned slot holding the old ESP */
07557                 ::);
07558 #endif
07559 #endif
07560 }
07561 
/*!
\brief Restore the stack pointer saved by SDL_imageFilterAlignStack() (x86, MMX builds only).

The inline assembly loads the 32-bit value found at the current top of the
stack ([ESP]) into EBX and copies it into ESP. SDL_imageFilterAlignStack()
leaves the previous ESP value at exactly that location.

When USE_MMX is not defined the function body is empty.

NOTE(review): this reads whatever is at [ESP] when the asm executes; it is
only the saved value if nothing was pushed in between (including by this
function's own prologue) -- verify for the toolchain in use.
NOTE(review): EBX and ESP are modified but the GCC asm statement has no
clobber list.
*/
07565 void SDL_imageFilterRestoreStack(void)
07566 {
07567 #ifdef USE_MMX
07568 #if !defined(GCC__)
07569         __asm
07570         {                               /* --- restoring old stack (MSVC syntax) --- */
07571                 mov ebx, [esp]          /* load the saved value of ESP from the top of the stack */
07572                 mov esp, ebx    /* restore old value of ESP */
07573         }
07574 #else
07575         asm volatile
07576                 (                               /* --- restoring old stack (GCC/AT&T syntax) --- */
07577                 "mov     (%%esp), %%ebx \n\t"   /* load the saved value of ESP from the top of the stack */
07578                 "mov       %%ebx, %%esp \n\t"   /* restore old value of ESP */
07579                 ::);
07580 #endif
07581 #endif
07582 }