Code

Initial revision
[rrdtool.git] / libraries / libpng-1.0.9 / pnggccrd.c
1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2  *
3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4  *
5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
8  *
9  * libpng 1.0.9 - January 31, 2001
10  * For conditions of distribution and use, see copyright notice in png.h
11  * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12  * Copyright (c) 1998, Intel Corporation
13  *
14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15  * Interface to libpng contributed by Gilles Vollant, 1999.
16  * GNU C port by Greg Roelofs, 1999-2001.
17  *
18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19  *
20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21  *
22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23  *
24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25  *        is required to assemble the newer MMX instructions such as movq.
26  *        For djgpp, see
27  *
28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29  *
30  *        (or a later version in the same directory).  For Linux, check your
31  *        distribution's web site(s) or try these links:
32  *
33  *           http://rufus.w3.org/linux/RPM/binutils.html
34  *           http://www.debian.org/Packages/stable/devel/binutils.html
35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36  *             binutils.tgz
37  *
38  *        For other platforms, see the main GNU site:
39  *
40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
41  *
42  *        Version 2.5.2l.15 is definitely too old...
43  */
45 /*
46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * =====================================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 1606)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (e.g., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (e.g., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991023:
91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92  *  - switched from string-concatenation-with-macros to cleaner method of
93  *     renaming global variables for djgpp--i.e., always use prefixes in
94  *     inlined assembler code (== strings) and conditionally rename the
95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96  *
97  * 19991024:
98  *  - fixed mmxsupport()/png_do_interlace() first-row bug
99  *     This one was severely weird:  even though mmxsupport() doesn't touch
100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101  *     the register (even in static/non-fPIC code--see below), which in turn
102  *     caused png_do_interlace() to return prematurely on the first row of
103  *     interlaced images (i.e., without expanding the interlaced pixels).
104  *     Inspection of the generated assembly code didn't turn up any clues,
105  *     although it did point at a minor optimization (i.e., get rid of
106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107  *     instruction is more destructive than it looks?  (Not yet checked.)
108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109  *     listings...  Apparently register spillage has to do with ebx, since
110  *     it's used to index the global offset table.  Commenting it out of the
111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113  *
114  * 19991107:
115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117  *
118  * 19991120:
119  *  - made "diff" variable (now "_dif") global to simplify conversion of
120  *     filtering routines (running out of regs, sigh).  "diff" is still used
121  *     in interlacing routines, however.
122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123  *     macro determines which is used); original not yet tested.
124  *
125  * 20000213:
126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127  *
128  * 20000319:
129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130  *     pass == 4 or 5, that caused visible corruption of interlaced images
131  *
132  * 20000623:
133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138  *     for the original (anonymous) SourceForge bug report.
139  *
140  * 20000706:
141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142  *       pnggccrd.c: In function `png_combine_row':
143  *       pnggccrd.c:525: more than 10 operands in `asm'
144  *       pnggccrd.c:669: more than 10 operands in `asm'
145  *       pnggccrd.c:828: more than 10 operands in `asm'
146  *       pnggccrd.c:994: more than 10 operands in `asm'
147  *       pnggccrd.c:1177: more than 10 operands in `asm'
148  *     They are all the same problem and can be worked around by using the
149  *     global _unmask variable unconditionally, not just in the -fPIC case.
150  *     Reportedly earlier versions of gcc also have the problem with more than
151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152  *
153  * 20000729:
154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
156  *  - to finish remaining sections:
157  *     - clean up indentation and comments
158  *     - preload local variables
159  *     - add output and input regs (order of former determines numerical
160  *        mapping of latter)
161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
163  *
164  * 20000731:
165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166  *
167  * 20000822:
168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169  *     shared-library (-fPIC) version!  Code works just fine as part of static
170  *     library.  Damn damn damn damn damn, should have tested that sooner.
171  *     ebx is getting clobbered again (explicitly this time); need to save it
172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173  *
174  * 20000823:
175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
177  *     and *Mask* globals and got rid of leading "$" signs.
178  *
179  * 20000826:
180  *  - added visual separators to help navigate microscopic printed copies
181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182  *     on png_read_filter_row_mmx_avg()
183  *
184  * 20000828:
185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187  *     cleaned up/shortened in either routine, but functionality is complete
188  *     and seems to be working fine.
189  *
190  * 20000829:
191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194  *     is simple enough...
195  *
196  * 20000914:
197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198  *     correctly (but 48-bit RGB just fine)
199  *
200  * 20000916:
201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205  *
206  * 20010103:
207  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
208  *     and made it public
209  *
210  * 20010104:
211  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
212  *     within MMX version of png_read_filter_row()) so no longer necessary to
213  *     compile it into pngrutil.o
214  *
215  * STILL TO DO:
216  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
217  *     - write MMX code for 48-bit case (pixel_bytes == 6)
218  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
219  *        why subtract 8 from width_mmx in the pass 4/5 case?
220  *        (only width_mmx case) (near line 1606)
221  *     - rewrite all MMX interlacing code so it's aligned with beginning
222  *        of the row buffer, not the end (see 19991007 for details)
223  *     x pick one version of mmxsupport() and get rid of the other
224  *     - add error messages to any remaining bogus default cases
225  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
226  *     - add support for runtime enable/disable/query of various MMX routines
227  */
229 //#define PNG_DEBUG 2   // GRR
231 #define PNG_INTERNAL
232 #include "png.h"
234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
236 #ifdef PNG_USE_LOCAL_ARRAYS  // private copies of the interlace tables; the comments in png_combine_row() attribute the originals to png.c
237 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};  // indexed by png_ptr->pass: pixel offset at which the non-MMX copy loop starts
238 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};  // indexed by png_ptr->pass: pixel stride between successive copies in that loop
239 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};  // indexed by png_ptr->pass: pixels copied per step (scaled to bytes by the caller, e.g. 2x at 16 bpp)
240 #endif
242 // djgpp, Win32, and Cygwin add their own underscores to global variables,
243 // so define them without; the inline asm strings always spell the prefixed name (e.g. "_unmask", "_mask8_0"), so only the C-level identifier is renamed here:
244 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
245 #  define _mmx_supported  mmx_supported
246 #  define _unmask         unmask
247 #  define _const4         const4
248 #  define _const6         const6
249 #  define _mask8_0        mask8_0
250 #  define _mask16_1       mask16_1
251 #  define _mask16_0       mask16_0
252 #  define _mask24_2       mask24_2
253 #  define _mask24_1       mask24_1
254 #  define _mask24_0       mask24_0
255 #  define _mask32_3       mask32_3
256 #  define _mask32_2       mask32_2
257 #  define _mask32_1       mask32_1
258 #  define _mask32_0       mask32_0
259 #  define _mask48_5       mask48_5
260 #  define _mask48_4       mask48_4
261 #  define _mask48_3       mask48_3
262 #  define _mask48_2       mask48_2
263 #  define _mask48_1       mask48_1
264 #  define _mask48_0       mask48_0
265 #  define _FullLength     FullLength
266 #  define _MMXLength      MMXLength
267 #  define _dif            dif
268 #  define _LBCarryMask    LBCarryMask
269 #  define _HBClearMask    HBClearMask
270 #  define _ActiveMask     ActiveMask
271 #  define _ActiveMask2    ActiveMask2
272 #  define _ActiveMaskEnd  ActiveMaskEnd
273 #  define _ShiftBpp       ShiftBpp
274 #  define _ShiftRem       ShiftRem
275 #  define _patemp         patemp
276 #  define _pbtemp         pbtemp
277 #  define _pctemp         pctemp
278 #endif  // __DJGPP__ || WIN32 || __CYGWIN__
280 static int _mmx_supported = 2;  // 2 = not yet probed: png_combine_row() calls png_mmx_support() while this is still 2 (expected to update it -- see the 20010103 changelog note); afterwards nonzero selects the MMX paths
282 /* These constants are used in the inlined MMX assembly code.
283    Ignore gcc's "At top level: defined but not used" warnings. */
285 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
286  *  since that case uses the %ebx register for indexing the Global Offset Table
287  *  and there were no other registers available.  But gcc 2.95 and later emit
288  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
289  *  in the non-PIC case, so we'll just use the global unconditionally now.
290  */
291 static int _unmask;  // png_combine_row() stores ~mask here before its asm blocks load it with movd and replicate the low byte across an MMX register
293 static unsigned long long _mask8_0  = 0x0102040810204080LL;  // one mask bit per byte, lowest-addressed byte = 0x80 (first pixel of each group of 8); ANDed with the replicated _unmask byte and compared with zero to form a per-byte select mask
295 static unsigned long long _mask16_1 = 0x0101020204040808LL;  // each mask bit repeated twice (2 bytes/pixel): bits 0x08..0x01, applied to bytes 8-15 of each 16-byte group
296 static unsigned long long _mask16_0 = 0x1010202040408080LL;  // bits 0x80..0x10, applied to bytes 0-7 of each 16-byte group
298 static unsigned long long _mask24_2 = 0x0101010202020404LL;  // _mask24_0.._2 together: each mask bit repeated three times over 24 bytes
299 static unsigned long long _mask24_1 = 0x0408080810101020LL;  // NOTE(review): consumer is outside this chunk -- presumably the 24-bpp case; confirm
300 static unsigned long long _mask24_0 = 0x2020404040808080LL;
302 static unsigned long long _mask32_3 = 0x0101010102020202LL;  // _mask32_0.._3 together: each mask bit repeated four times over 32 bytes
303 static unsigned long long _mask32_2 = 0x0404040408080808LL;  // NOTE(review): consumer is outside this chunk -- presumably the 32-bpp case; confirm
304 static unsigned long long _mask32_1 = 0x1010101020202020LL;
305 static unsigned long long _mask32_0 = 0x4040404080808080LL;
307 static unsigned long long _mask48_5 = 0x0101010101010202LL;  // _mask48_0.._5 together: each mask bit repeated six times over 48 bytes
308 static unsigned long long _mask48_4 = 0x0202020204040404LL;  // NOTE(review): consumer is outside this chunk -- presumably the 48-bpp case; confirm
309 static unsigned long long _mask48_3 = 0x0404080808080808LL;
310 static unsigned long long _mask48_2 = 0x1010101010102020LL;
311 static unsigned long long _mask48_1 = 0x2020202040404040LL;
312 static unsigned long long _mask48_0 = 0x4040808080808080LL;
314 static unsigned long long _const4   = 0x0000000000FFFFFFLL;  // low three bytes set; users are outside this chunk
315 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
316 static unsigned long long _const6   = 0x00000000000000FFLL;  // low byte set; users are outside this chunk
318 // These are used in the row-filter routines and should/would be local
319 //  variables if not for gcc addressing limitations.
321 static png_uint_32  _FullLength;
322 static png_uint_32  _MMXLength;
323 static int          _dif;       // global replacement for the filter routines' "diff" local (see the 19991120 changelog note)
324 static int          _patemp;    // temp variables for Paeth routine
325 static int          _pbtemp;
326 static int          _pctemp;
331 //===========================================================================//
332 //                                                                           //
333 //                       P N G _ C O M B I N E _ R O W                       //
334 //                                                                           //
335 //===========================================================================//
337 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
339 /* Combines the row recently read in with the previous row.
340    This routine takes care of alpha and transparency if requested.
341    This routine also handles the two methods of progressive display
342    of interlaced images, depending on the mask value.
343    The mask value describes which pixels are to be combined with
344    the row.  The pattern always repeats every 8 pixels, so just 8
345    bits are needed.  A one indicates the pixel is to be combined; a
346    zero indicates the pixel is to be skipped.  This is in addition
347    to any alpha or transparency value associated with the pixel.
348    If you want all pixels to be combined, pass 0xff (255) in mask. */
350 /* Use this routine for the x86 platform - it uses a faster MMX routine
351    if the machine supports MMX. */
353 void /* PRIVATE */
354 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
356    png_debug(1,"in png_combine_row_asm\n");
358    if (_mmx_supported == 2) {
359        png_mmx_support();
360    }
362    if (mask == 0xff)
363    {
364       png_memcpy(row, png_ptr->row_buf + 1,
365        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
366    }
367    /* GRR:  png_combine_row() never called with mask == 0 */
368    else
369    {
370       switch (png_ptr->row_info.pixel_depth)
371       {
372          case 1:        // png_ptr->row_info.pixel_depth
373          {
374             png_bytep sp;
375             png_bytep dp;
376             int s_inc, s_start, s_end;
377             int m;
378             int shift;
379             png_uint_32 i;
381             sp = png_ptr->row_buf + 1;
382             dp = row;
383             m = 0x80;
384 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
385             if (png_ptr->transformations & PNG_PACKSWAP)
386             {
387                 s_start = 0;
388                 s_end = 7;
389                 s_inc = 1;
390             }
391             else
392 #endif
393             {
394                 s_start = 7;
395                 s_end = 0;
396                 s_inc = -1;
397             }
399             shift = s_start;
401             for (i = 0; i < png_ptr->width; i++)
402             {
403                if (m & mask)
404                {
405                   int value;
407                   value = (*sp >> shift) & 0x1;
408                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
409                   *dp |= (png_byte)(value << shift);
410                }
412                if (shift == s_end)
413                {
414                   shift = s_start;
415                   sp++;
416                   dp++;
417                }
418                else
419                   shift += s_inc;
421                if (m == 1)
422                   m = 0x80;
423                else
424                   m >>= 1;
425             }
426             break;
427          }
429          case 2:        // png_ptr->row_info.pixel_depth
430          {
431             png_bytep sp;
432             png_bytep dp;
433             int s_start, s_end, s_inc;
434             int m;
435             int shift;
436             png_uint_32 i;
437             int value;
439             sp = png_ptr->row_buf + 1;
440             dp = row;
441             m = 0x80;
442 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
443             if (png_ptr->transformations & PNG_PACKSWAP)
444             {
445                s_start = 0;
446                s_end = 6;
447                s_inc = 2;
448             }
449             else
450 #endif
451             {
452                s_start = 6;
453                s_end = 0;
454                s_inc = -2;
455             }
457             shift = s_start;
459             for (i = 0; i < png_ptr->width; i++)
460             {
461                if (m & mask)
462                {
463                   value = (*sp >> shift) & 0x3;
464                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
465                   *dp |= (png_byte)(value << shift);
466                }
468                if (shift == s_end)
469                {
470                   shift = s_start;
471                   sp++;
472                   dp++;
473                }
474                else
475                   shift += s_inc;
476                if (m == 1)
477                   m = 0x80;
478                else
479                   m >>= 1;
480             }
481             break;
482          }
484          case 4:        // png_ptr->row_info.pixel_depth
485          {
486             png_bytep sp;
487             png_bytep dp;
488             int s_start, s_end, s_inc;
489             int m;
490             int shift;
491             png_uint_32 i;
492             int value;
494             sp = png_ptr->row_buf + 1;
495             dp = row;
496             m = 0x80;
497 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
498             if (png_ptr->transformations & PNG_PACKSWAP)
499             {
500                s_start = 0;
501                s_end = 4;
502                s_inc = 4;
503             }
504             else
505 #endif
506             {
507                s_start = 4;
508                s_end = 0;
509                s_inc = -4;
510             }
511             shift = s_start;
513             for (i = 0; i < png_ptr->width; i++)
514             {
515                if (m & mask)
516                {
517                   value = (*sp >> shift) & 0xf;
518                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
519                   *dp |= (png_byte)(value << shift);
520                }
522                if (shift == s_end)
523                {
524                   shift = s_start;
525                   sp++;
526                   dp++;
527                }
528                else
529                   shift += s_inc;
530                if (m == 1)
531                   m = 0x80;
532                else
533                   m >>= 1;
534             }
535             break;
536          }
538          case 8:        // png_ptr->row_info.pixel_depth
539          {
540             png_bytep srcptr;
541             png_bytep dstptr;
543             if ( _mmx_supported  )
544             {
545                png_uint_32 len;
546                int diff;
547                int dummy_value_a;   // fix 'forbidden register spilled' error
548                int dummy_value_d;
549                int dummy_value_c;
550                int dummy_value_S;
551                int dummy_value_D;
552                _unmask = ~mask;            // global variable for -fPIC version
553                srcptr = png_ptr->row_buf + 1;
554                dstptr = row;
555                len  = png_ptr->width &~7;  // reduce to multiple of 8
556                diff = png_ptr->width & 7;  // amount lost
558                __asm__ __volatile__ (
559                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
560                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
561                   "punpcklbw %%mm7, %%mm7    \n\t"
562                   "punpcklwd %%mm7, %%mm7    \n\t"
563                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
565                   "movq      _mask8_0, %%mm0 \n\t"
566                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
567                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
569 // preload        "movl      len, %%ecx      \n\t" // load length of line
570 // preload        "movl      srcptr, %%esi   \n\t" // load source
571 // preload        "movl      dstptr, %%edi   \n\t" // load dest
573                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
574                   "je        mainloop8end    \n\t"
576                 "mainloop8:                  \n\t"
577                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
578                   "pand      %%mm0, %%mm4    \n\t"
579                   "movq      %%mm0, %%mm6    \n\t"
580                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
581                   "por       %%mm6, %%mm4    \n\t"
582                   "movq      %%mm4, (%%edi)  \n\t"
583                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
584                   "addl      $8, %%edi       \n\t"
585                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
586                   "ja        mainloop8       \n\t"
588                 "mainloop8end:               \n\t"
589 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
590                   "movl      %%eax, %%ecx    \n\t"
591                   "cmpl      $0, %%ecx       \n\t"
592                   "jz        end8            \n\t"
593 // preload        "movl      mask, %%edx     \n\t"
594                   "sall      $24, %%edx      \n\t" // make low byte, high byte
596                 "secondloop8:                \n\t"
597                   "sall      %%edx           \n\t" // move high bit to CF
598                   "jnc       skip8           \n\t" // if CF = 0
599                   "movb      (%%esi), %%al   \n\t"
600                   "movb      %%al, (%%edi)   \n\t"
602                 "skip8:                      \n\t"
603                   "incl      %%esi           \n\t"
604                   "incl      %%edi           \n\t"
605                   "decl      %%ecx           \n\t"
606                   "jnz       secondloop8     \n\t"
608                 "end8:                       \n\t"
609                   "EMMS                      \n\t"  // DONE
611                   : "=a" (dummy_value_a),           // output regs (dummy)
612                     "=d" (dummy_value_d),
613                     "=c" (dummy_value_c),
614                     "=S" (dummy_value_S),
615                     "=D" (dummy_value_D)
617                   : "3" (srcptr),      // esi       // input regs
618                     "4" (dstptr),      // edi
619                     "0" (diff),        // eax
620 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
621                     "2" (len),         // ecx
622                     "1" (mask)         // edx
624 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
625                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
626 #endif
627                );
628             }
629             else /* mmx _not supported - Use modified C routine */
630             {
631                register png_uint_32 i;
632                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
633                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
634                register int stride = png_pass_inc[png_ptr->pass];
635                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
636                register int rep_bytes = png_pass_width[png_ptr->pass];
637                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
638                register png_uint_32 final_val = png_ptr->width;
640                srcptr = png_ptr->row_buf + 1 + initial_val;
641                dstptr = row + initial_val;
643                for (i = initial_val; i < final_val; i += stride)
644                {
645                   png_memcpy(dstptr, srcptr, rep_bytes);
646                   srcptr += stride;
647                   dstptr += stride;
648                }
649             } /* end of else */
651             break;
652          }       // end 8 bpp
654          case 16:       // png_ptr->row_info.pixel_depth
655          {
656             png_bytep srcptr;
657             png_bytep dstptr;
659             if ( _mmx_supported )
660             {
661                png_uint_32 len;
662                int diff;
663                int dummy_value_a;   // fix 'forbidden register spilled' error
664                int dummy_value_d;
665                int dummy_value_c;
666                int dummy_value_S;
667                int dummy_value_D;
668                _unmask = ~mask;            // global variable for -fPIC version
669                srcptr = png_ptr->row_buf + 1;
670                dstptr = row;
671                len  = png_ptr->width &~7;  // reduce to multiple of 8
672                diff = png_ptr->width & 7;  // amount lost
674                __asm__ __volatile__ (
675                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
676                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
677                   "punpcklbw %%mm7, %%mm7     \n\t"
678                   "punpcklwd %%mm7, %%mm7     \n\t"
679                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
681                   "movq      _mask16_0, %%mm0 \n\t"
682                   "movq      _mask16_1, %%mm1 \n\t"
684                   "pand      %%mm7, %%mm0     \n\t"
685                   "pand      %%mm7, %%mm1     \n\t"
687                   "pcmpeqb   %%mm6, %%mm0     \n\t"
688                   "pcmpeqb   %%mm6, %%mm1     \n\t"
690 // preload        "movl      len, %%ecx       \n\t" // load length of line
691 // preload        "movl      srcptr, %%esi    \n\t" // load source
692 // preload        "movl      dstptr, %%edi    \n\t" // load dest
694                   "cmpl      $0, %%ecx        \n\t"
695                   "jz        mainloop16end    \n\t"
697                 "mainloop16:                  \n\t"
698                   "movq      (%%esi), %%mm4   \n\t"
699                   "pand      %%mm0, %%mm4     \n\t"
700                   "movq      %%mm0, %%mm6     \n\t"
701                   "movq      (%%edi), %%mm7   \n\t"
702                   "pandn     %%mm7, %%mm6     \n\t"
703                   "por       %%mm6, %%mm4     \n\t"
704                   "movq      %%mm4, (%%edi)   \n\t"
706                   "movq      8(%%esi), %%mm5  \n\t"
707                   "pand      %%mm1, %%mm5     \n\t"
708                   "movq      %%mm1, %%mm7     \n\t"
709                   "movq      8(%%edi), %%mm6  \n\t"
710                   "pandn     %%mm6, %%mm7     \n\t"
711                   "por       %%mm7, %%mm5     \n\t"
712                   "movq      %%mm5, 8(%%edi)  \n\t"
714                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
715                   "addl      $16, %%edi       \n\t"
716                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
717                   "ja        mainloop16       \n\t"
719                 "mainloop16end:               \n\t"
720 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
721                   "movl      %%eax, %%ecx     \n\t"
722                   "cmpl      $0, %%ecx        \n\t"
723                   "jz        end16            \n\t"
724 // preload        "movl      mask, %%edx      \n\t"
725                   "sall      $24, %%edx       \n\t" // make low byte, high byte
727                 "secondloop16:                \n\t"
728                   "sall      %%edx            \n\t" // move high bit to CF
729                   "jnc       skip16           \n\t" // if CF = 0
730                   "movw      (%%esi), %%ax    \n\t"
731                   "movw      %%ax, (%%edi)    \n\t"
733                 "skip16:                      \n\t"
734                   "addl      $2, %%esi        \n\t"
735                   "addl      $2, %%edi        \n\t"
736                   "decl      %%ecx            \n\t"
737                   "jnz       secondloop16     \n\t"
739                 "end16:                       \n\t"
740                   "EMMS                       \n\t" // DONE
742                   : "=a" (dummy_value_a),           // output regs (dummy)
743                     "=c" (dummy_value_c),
744                     "=d" (dummy_value_d),
745                     "=S" (dummy_value_S),
746                     "=D" (dummy_value_D)
748                   : "0" (diff),        // eax       // input regs
749 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
750                     "1" (len),         // ecx
751                     "2" (mask),        // edx
752                     "3" (srcptr),      // esi
753                     "4" (dstptr)       // edi
755 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
756                   : "%mm0", "%mm1", "%mm4"          // clobber list
757                   , "%mm5", "%mm6", "%mm7"
758 #endif
759                );
760             }
761             else /* mmx _not supported - Use modified C routine */
762             {
                   // Plain-C path for 16 bpp (2 bytes per pixel).  It does not look
                   // at the individual mask bits; starting at the pass's first
                   // column it copies rep_bytes from row_buf to row every stride
                   // bytes until final_val (2 * width) is reached.
763                register png_uint_32 i;
764                png_uint_32 initial_val = 2 * png_pass_start[png_ptr->pass];
765                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
766                register int stride = 2 * png_pass_inc[png_ptr->pass];
767                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
768                register int rep_bytes = 2 * png_pass_width[png_ptr->pass];
769                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
770                register png_uint_32 final_val = 2 * png_ptr->width;
772                srcptr = png_ptr->row_buf + 1 + initial_val;
773                dstptr = row + initial_val;
775                for (i = initial_val; i < final_val; i += stride)
776                {
                      // Clamp the last run so the copy never extends past
                      // final_val; without this a row whose width is not a
                      // multiple of the run length is overrun on both buffers.
                      if ((png_uint_32)rep_bytes > final_val - i)
                         rep_bytes = (int)(final_val - i);
777                   png_memcpy(dstptr, srcptr, rep_bytes);
778                   srcptr += stride;
779                   dstptr += stride;
780                }
781             } /* end of else */
783             break;
784          }       // end 16 bpp
786          case 24:       // png_ptr->row_info.pixel_depth
787          {
788             png_bytep srcptr;
789             png_bytep dstptr;
791             if ( _mmx_supported )
792             {
793                png_uint_32 len;
794                int diff;
795                int dummy_value_a;   // fix 'forbidden register spilled' error
796                int dummy_value_d;
797                int dummy_value_c;
798                int dummy_value_S;
799                int dummy_value_D;
800                _unmask = ~mask;            // global variable for -fPIC version
801                srcptr = png_ptr->row_buf + 1;
802                dstptr = row;
803                len  = png_ptr->width &~7;  // reduce to multiple of 8
804                diff = png_ptr->width & 7;  // amount lost
806                __asm__ __volatile__ (
807                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
808                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
809                   "punpcklbw %%mm7, %%mm7     \n\t"
810                   "punpcklwd %%mm7, %%mm7     \n\t"
811                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
813                   "movq      _mask24_0, %%mm0 \n\t"
814                   "movq      _mask24_1, %%mm1 \n\t"
815                   "movq      _mask24_2, %%mm2 \n\t"
817                   "pand      %%mm7, %%mm0     \n\t"
818                   "pand      %%mm7, %%mm1     \n\t"
819                   "pand      %%mm7, %%mm2     \n\t"
821                   "pcmpeqb   %%mm6, %%mm0     \n\t"
822                   "pcmpeqb   %%mm6, %%mm1     \n\t"
823                   "pcmpeqb   %%mm6, %%mm2     \n\t"
825 // preload        "movl      len, %%ecx       \n\t" // load length of line
826 // preload        "movl      srcptr, %%esi    \n\t" // load source
827 // preload        "movl      dstptr, %%edi    \n\t" // load dest
829                   "cmpl      $0, %%ecx        \n\t"
830                   "jz        mainloop24end    \n\t"
832                 "mainloop24:                  \n\t"
833                   "movq      (%%esi), %%mm4   \n\t"
834                   "pand      %%mm0, %%mm4     \n\t"
835                   "movq      %%mm0, %%mm6     \n\t"
836                   "movq      (%%edi), %%mm7   \n\t"
837                   "pandn     %%mm7, %%mm6     \n\t"
838                   "por       %%mm6, %%mm4     \n\t"
839                   "movq      %%mm4, (%%edi)   \n\t"
841                   "movq      8(%%esi), %%mm5  \n\t"
842                   "pand      %%mm1, %%mm5     \n\t"
843                   "movq      %%mm1, %%mm7     \n\t"
844                   "movq      8(%%edi), %%mm6  \n\t"
845                   "pandn     %%mm6, %%mm7     \n\t"
846                   "por       %%mm7, %%mm5     \n\t"
847                   "movq      %%mm5, 8(%%edi)  \n\t"
849                   "movq      16(%%esi), %%mm6 \n\t"
850                   "pand      %%mm2, %%mm6     \n\t"
851                   "movq      %%mm2, %%mm4     \n\t"
852                   "movq      16(%%edi), %%mm7 \n\t"
853                   "pandn     %%mm7, %%mm4     \n\t"
854                   "por       %%mm4, %%mm6     \n\t"
855                   "movq      %%mm6, 16(%%edi) \n\t"
857                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
858                   "addl      $24, %%edi       \n\t"
859                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
861                   "ja        mainloop24       \n\t"
863                 "mainloop24end:               \n\t"
864 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
865                   "movl      %%eax, %%ecx     \n\t"
866                   "cmpl      $0, %%ecx        \n\t"
867                   "jz        end24            \n\t"
868 // preload        "movl      mask, %%edx      \n\t"
869                   "sall      $24, %%edx       \n\t" // make low byte, high byte
871                 "secondloop24:                \n\t"
872                   "sall      %%edx            \n\t" // move high bit to CF
873                   "jnc       skip24           \n\t" // if CF = 0
874                   "movw      (%%esi), %%ax    \n\t"
875                   "movw      %%ax, (%%edi)    \n\t"
876                   "xorl      %%eax, %%eax     \n\t"
877                   "movb      2(%%esi), %%al   \n\t"
878                   "movb      %%al, 2(%%edi)   \n\t"
880                 "skip24:                      \n\t"
881                   "addl      $3, %%esi        \n\t"
882                   "addl      $3, %%edi        \n\t"
883                   "decl      %%ecx            \n\t"
884                   "jnz       secondloop24     \n\t"
886                 "end24:                       \n\t"
887                   "EMMS                       \n\t" // DONE
889                   : "=a" (dummy_value_a),           // output regs (dummy)
890                     "=d" (dummy_value_d),
891                     "=c" (dummy_value_c),
892                     "=S" (dummy_value_S),
893                     "=D" (dummy_value_D)
895                   : "3" (srcptr),      // esi       // input regs
896                     "4" (dstptr),      // edi
897                     "0" (diff),        // eax
898 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
899                     "2" (len),         // ecx
900                     "1" (mask)         // edx
902 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
903                   : "%mm0", "%mm1", "%mm2"          // clobber list
904                   , "%mm4", "%mm5", "%mm6", "%mm7"
905 #endif
906                );
907             }
908             else /* mmx _not supported - Use modified C routine */
909             {
910                register png_uint_32 i;
911                png_uint_32 initial_val = 3 * png_pass_start[png_ptr->pass];
912                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
913                register int stride = 3 * png_pass_inc[png_ptr->pass];
914                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
915                register int rep_bytes = 3 * png_pass_width[png_ptr->pass];
916                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
917                register png_uint_32 final_val = 3 * png_ptr->width;
919                srcptr = png_ptr->row_buf + 1 + initial_val;
920                dstptr = row + initial_val;
922                for (i = initial_val; i < final_val; i += stride)
923                {
924                   png_memcpy(dstptr, srcptr, rep_bytes);
925                   srcptr += stride;
926                   dstptr += stride;
927                }
928             } /* end of else */
930             break;
931          }       // end 24 bpp
933          case 32:       // png_ptr->row_info.pixel_depth
934          {
935             png_bytep srcptr;
936             png_bytep dstptr;
938             if ( _mmx_supported )
939             {
940                png_uint_32 len;
941                int diff;
942                int dummy_value_a;   // fix 'forbidden register spilled' error
943                int dummy_value_d;
944                int dummy_value_c;
945                int dummy_value_S;
946                int dummy_value_D;
947                _unmask = ~mask;            // global variable for -fPIC version
948                srcptr = png_ptr->row_buf + 1;
949                dstptr = row;
950                len  = png_ptr->width &~7;  // reduce to multiple of 8
951                diff = png_ptr->width & 7;  // amount lost
953                __asm__ __volatile__ (
954                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
955                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
956                   "punpcklbw %%mm7, %%mm7     \n\t"
957                   "punpcklwd %%mm7, %%mm7     \n\t"
958                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
960                   "movq      _mask32_0, %%mm0 \n\t"
961                   "movq      _mask32_1, %%mm1 \n\t"
962                   "movq      _mask32_2, %%mm2 \n\t"
963                   "movq      _mask32_3, %%mm3 \n\t"
965                   "pand      %%mm7, %%mm0     \n\t"
966                   "pand      %%mm7, %%mm1     \n\t"
967                   "pand      %%mm7, %%mm2     \n\t"
968                   "pand      %%mm7, %%mm3     \n\t"
970                   "pcmpeqb   %%mm6, %%mm0     \n\t"
971                   "pcmpeqb   %%mm6, %%mm1     \n\t"
972                   "pcmpeqb   %%mm6, %%mm2     \n\t"
973                   "pcmpeqb   %%mm6, %%mm3     \n\t"
975 // preload        "movl      len, %%ecx       \n\t" // load length of line
976 // preload        "movl      srcptr, %%esi    \n\t" // load source
977 // preload        "movl      dstptr, %%edi    \n\t" // load dest
979                   "cmpl      $0, %%ecx        \n\t" // lcr
980                   "jz        mainloop32end    \n\t"
982                 "mainloop32:                  \n\t"
983                   "movq      (%%esi), %%mm4   \n\t"
984                   "pand      %%mm0, %%mm4     \n\t"
985                   "movq      %%mm0, %%mm6     \n\t"
986                   "movq      (%%edi), %%mm7   \n\t"
987                   "pandn     %%mm7, %%mm6     \n\t"
988                   "por       %%mm6, %%mm4     \n\t"
989                   "movq      %%mm4, (%%edi)   \n\t"
991                   "movq      8(%%esi), %%mm5  \n\t"
992                   "pand      %%mm1, %%mm5     \n\t"
993                   "movq      %%mm1, %%mm7     \n\t"
994                   "movq      8(%%edi), %%mm6  \n\t"
995                   "pandn     %%mm6, %%mm7     \n\t"
996                   "por       %%mm7, %%mm5     \n\t"
997                   "movq      %%mm5, 8(%%edi)  \n\t"
999                   "movq      16(%%esi), %%mm6 \n\t"
1000                   "pand      %%mm2, %%mm6     \n\t"
1001                   "movq      %%mm2, %%mm4     \n\t"
1002                   "movq      16(%%edi), %%mm7 \n\t"
1003                   "pandn     %%mm7, %%mm4     \n\t"
1004                   "por       %%mm4, %%mm6     \n\t"
1005                   "movq      %%mm6, 16(%%edi) \n\t"
1007                   "movq      24(%%esi), %%mm7 \n\t"
1008                   "pand      %%mm3, %%mm7     \n\t"
1009                   "movq      %%mm3, %%mm5     \n\t"
1010                   "movq      24(%%edi), %%mm4 \n\t"
1011                   "pandn     %%mm4, %%mm5     \n\t"
1012                   "por       %%mm5, %%mm7     \n\t"
1013                   "movq      %%mm7, 24(%%edi) \n\t"
1015                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1016                   "addl      $32, %%edi       \n\t"
1017                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1018                   "ja        mainloop32       \n\t"
1020                 "mainloop32end:               \n\t"
1021 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1022                   "movl      %%eax, %%ecx     \n\t"
1023                   "cmpl      $0, %%ecx        \n\t"
1024                   "jz        end32            \n\t"
1025 // preload        "movl      mask, %%edx      \n\t"
1026                   "sall      $24, %%edx       \n\t" // low byte => high byte
1028                 "secondloop32:                \n\t"
1029                   "sall      %%edx            \n\t" // move high bit to CF
1030                   "jnc       skip32           \n\t" // if CF = 0
1031                   "movl      (%%esi), %%eax   \n\t"
1032                   "movl      %%eax, (%%edi)   \n\t"
1034                 "skip32:                      \n\t"
1035                   "addl      $4, %%esi        \n\t"
1036                   "addl      $4, %%edi        \n\t"
1037                   "decl      %%ecx            \n\t"
1038                   "jnz       secondloop32     \n\t"
1040                 "end32:                       \n\t"
1041                   "EMMS                       \n\t" // DONE
1043                   : "=a" (dummy_value_a),           // output regs (dummy)
1044                     "=d" (dummy_value_d),
1045                     "=c" (dummy_value_c),
1046                     "=S" (dummy_value_S),
1047                     "=D" (dummy_value_D)
1049                   : "3" (srcptr),      // esi       // input regs
1050                     "4" (dstptr),      // edi
1051                     "0" (diff),        // eax
1052 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1053                     "2" (len),         // ecx
1054                     "1" (mask)         // edx
1056 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1057                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1058                   , "%mm4", "%mm5", "%mm6", "%mm7"
1059 #endif
1060                );
1061             }
1062             else /* mmx _not supported - Use modified C routine */
1063             {
1064                register png_uint_32 i;
1065                png_uint_32 initial_val = 4 * png_pass_start[png_ptr->pass];
1066                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1067                register int stride = 4 * png_pass_inc[png_ptr->pass];
1068                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1069                register int rep_bytes = 4 * png_pass_width[png_ptr->pass];
1070                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1071                register png_uint_32 final_val = 4 * png_ptr->width;
1073                srcptr = png_ptr->row_buf + 1 + initial_val;
1074                dstptr = row + initial_val;
1076                for (i = initial_val; i < final_val; i += stride)
1077                {
1078                   png_memcpy(dstptr, srcptr, rep_bytes);
1079                   srcptr += stride;
1080                   dstptr += stride;
1081                }
1082             } /* end of else */
1084             break;
1085          }       // end 32 bpp
1087          case 48:       // png_ptr->row_info.pixel_depth
1088          {
1089             png_bytep srcptr;
1090             png_bytep dstptr;
1092             if ( _mmx_supported )
1093             {
1094                png_uint_32 len;
1095                int diff;
1096                int dummy_value_a;   // fix 'forbidden register spilled' error
1097                int dummy_value_d;
1098                int dummy_value_c;
1099                int dummy_value_S;
1100                int dummy_value_D;
1101                _unmask = ~mask;            // global variable for -fPIC version
1102                srcptr = png_ptr->row_buf + 1;
1103                dstptr = row;
1104                len  = png_ptr->width &~7;  // reduce to multiple of 8
1105                diff = png_ptr->width & 7;  // amount lost
1107                __asm__ __volatile__ (
1108                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1109                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1110                   "punpcklbw %%mm7, %%mm7     \n\t"
1111                   "punpcklwd %%mm7, %%mm7     \n\t"
1112                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1114                   "movq      _mask48_0, %%mm0 \n\t"
1115                   "movq      _mask48_1, %%mm1 \n\t"
1116                   "movq      _mask48_2, %%mm2 \n\t"
1117                   "movq      _mask48_3, %%mm3 \n\t"
1118                   "movq      _mask48_4, %%mm4 \n\t"
1119                   "movq      _mask48_5, %%mm5 \n\t"
1121                   "pand      %%mm7, %%mm0     \n\t"
1122                   "pand      %%mm7, %%mm1     \n\t"
1123                   "pand      %%mm7, %%mm2     \n\t"
1124                   "pand      %%mm7, %%mm3     \n\t"
1125                   "pand      %%mm7, %%mm4     \n\t"
1126                   "pand      %%mm7, %%mm5     \n\t"
1128                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1129                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1130                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1131                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1132                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1133                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1135 // preload        "movl      len, %%ecx       \n\t" // load length of line
1136 // preload        "movl      srcptr, %%esi    \n\t" // load source
1137 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1139                   "cmpl      $0, %%ecx        \n\t"
1140                   "jz        mainloop48end    \n\t"
1142                 "mainloop48:                  \n\t"
1143                   "movq      (%%esi), %%mm7   \n\t"
1144                   "pand      %%mm0, %%mm7     \n\t"
1145                   "movq      %%mm0, %%mm6     \n\t"
1146                   "pandn     (%%edi), %%mm6   \n\t"
1147                   "por       %%mm6, %%mm7     \n\t"
1148                   "movq      %%mm7, (%%edi)   \n\t"
1150                   "movq      8(%%esi), %%mm6  \n\t"
1151                   "pand      %%mm1, %%mm6     \n\t"
1152                   "movq      %%mm1, %%mm7     \n\t"
1153                   "pandn     8(%%edi), %%mm7  \n\t"
1154                   "por       %%mm7, %%mm6     \n\t"
1155                   "movq      %%mm6, 8(%%edi)  \n\t"
1157                   "movq      16(%%esi), %%mm6 \n\t"
1158                   "pand      %%mm2, %%mm6     \n\t"
1159                   "movq      %%mm2, %%mm7     \n\t"
1160                   "pandn     16(%%edi), %%mm7 \n\t"
1161                   "por       %%mm7, %%mm6     \n\t"
1162                   "movq      %%mm6, 16(%%edi) \n\t"
1164                   "movq      24(%%esi), %%mm7 \n\t"
1165                   "pand      %%mm3, %%mm7     \n\t"
1166                   "movq      %%mm3, %%mm6     \n\t"
1167                   "pandn     24(%%edi), %%mm6 \n\t"
1168                   "por       %%mm6, %%mm7     \n\t"
1169                   "movq      %%mm7, 24(%%edi) \n\t"
1171                   "movq      32(%%esi), %%mm6 \n\t"
1172                   "pand      %%mm4, %%mm6     \n\t"
1173                   "movq      %%mm4, %%mm7     \n\t"
1174                   "pandn     32(%%edi), %%mm7 \n\t"
1175                   "por       %%mm7, %%mm6     \n\t"
1176                   "movq      %%mm6, 32(%%edi) \n\t"
1178                   "movq      40(%%esi), %%mm7 \n\t"
1179                   "pand      %%mm5, %%mm7     \n\t"
1180                   "movq      %%mm5, %%mm6     \n\t"
1181                   "pandn     40(%%edi), %%mm6 \n\t"
1182                   "por       %%mm6, %%mm7     \n\t"
1183                   "movq      %%mm7, 40(%%edi) \n\t"
1185                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1186                   "addl      $48, %%edi       \n\t"
1187                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1189                   "ja        mainloop48       \n\t"
1191                 "mainloop48end:               \n\t"
1192 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1193                   "movl      %%eax, %%ecx     \n\t"
1194                   "cmpl      $0, %%ecx        \n\t"
1195                   "jz        end48            \n\t"
1196 // preload        "movl      mask, %%edx      \n\t"
1197                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1199                 "secondloop48:                \n\t"
1200                   "sall      %%edx            \n\t" // move high bit to CF
1201                   "jnc       skip48           \n\t" // if CF = 0
1202                   "movl      (%%esi), %%eax   \n\t"
1203                   "movl      %%eax, (%%edi)   \n\t"
1205                 "skip48:                      \n\t"
1206                   "addl      $4, %%esi        \n\t"
1207                   "addl      $4, %%edi        \n\t"
1208                   "decl      %%ecx            \n\t"
1209                   "jnz       secondloop48     \n\t"
1211                 "end48:                       \n\t"
1212                   "EMMS                       \n\t" // DONE
1214                   : "=a" (dummy_value_a),           // output regs (dummy)
1215                     "=d" (dummy_value_d),
1216                     "=c" (dummy_value_c),
1217                     "=S" (dummy_value_S),
1218                     "=D" (dummy_value_D)
1220                   : "3" (srcptr),      // esi       // input regs
1221                     "4" (dstptr),      // edi
1222                     "0" (diff),        // eax
1223 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1224                     "2" (len),         // ecx
1225                     "1" (mask)         // edx
1227 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1228                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1229                   , "%mm4", "%mm5", "%mm6", "%mm7"
1230 #endif
1231                );
1232             }
1233             else /* mmx _not supported - Use modified C routine */
1234             {
1235                register png_uint_32 i;
1236                png_uint_32 initial_val = 6 * png_pass_start[png_ptr->pass];
1237                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1238                register int stride = 6 * png_pass_inc[png_ptr->pass];
1239                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1240                register int rep_bytes = 6 * png_pass_width[png_ptr->pass];
1241                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1242                register png_uint_32 final_val = 6 * png_ptr->width;
1244                srcptr = png_ptr->row_buf + 1 + initial_val;
1245                dstptr = row + initial_val;
1247                for (i = initial_val; i < final_val; i += stride)
1248                {
1249                   png_memcpy(dstptr, srcptr, rep_bytes);
1250                   srcptr += stride;
1251                   dstptr += stride;
1252                }
1253             } /* end of else */
1255             break;
1256          }       // end 48 bpp
1258          case 64:       // png_ptr->row_info.pixel_depth
1259          {
                 // 64 bpp (8 bytes per pixel): no MMX variant, plain C only.  The
                 // individual mask bits are not consulted; starting at the pass's
                 // first column, rep_bytes are copied from row_buf to row every
                 // stride bytes until final_val (8 * width) is reached.
1260             png_bytep srcptr;
1261             png_bytep dstptr;
1262             register png_uint_32 i;
1263             png_uint_32 initial_val = 8 * png_pass_start[png_ptr->pass];
1264               // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1265             register int stride = 8 * png_pass_inc[png_ptr->pass];
1266               // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1267             register int rep_bytes = 8 * png_pass_width[png_ptr->pass];
1268               // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1269             register png_uint_32 final_val = 8 * png_ptr->width;
1271             srcptr = png_ptr->row_buf + 1 + initial_val;
1272             dstptr = row + initial_val;
1274             for (i = initial_val; i < final_val; i += stride)
1275             {
                    // Clamp the last run so the copy never extends past
                    // final_val; without this a row whose width is not a
                    // multiple of the run length is overrun on both buffers.
                    if ((png_uint_32)rep_bytes > final_val - i)
                       rep_bytes = (int)(final_val - i);
1276                png_memcpy(dstptr, srcptr, rep_bytes);
1277                srcptr += stride;
1278                dstptr += stride;
1279             }
1280             break;
1281          }       // end 64 bpp
1283          default:   // png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64
1284          {
1285             // this should never happen
1286             fprintf(stderr,
1287               "libpng internal error:  png_ptr->row_info.pixel_depth = %d\n",
1288               png_ptr->row_info.pixel_depth);
1289             fflush(stderr);
1290             break;
1291          }
1292       } /* end switch (png_ptr->row_info.pixel_depth) */
1294    } /* end if (non-trivial mask) */
1296 } /* end png_combine_row() */
1298 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1303 //===========================================================================//
1304 //                                                                           //
1305 //                 P N G _ D O _ R E A D _ I N T E R L A C E                 //
1306 //                                                                           //
1307 //===========================================================================//
1309 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1310 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1312 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1313  * has taken place.  [GRR: what other steps come before and/or after?]
1314  */
1316 void /* PRIVATE */
1317 png_do_read_interlace(png_structp png_ptr)
1319    png_row_infop row_info = &(png_ptr->row_info);
1320    png_bytep row = png_ptr->row_buf + 1;
1321    int pass = png_ptr->pass;
1322    png_uint_32 transformations = png_ptr->transformations;
1324    png_debug(1,"in png_do_read_interlace\n");
1326    if (_mmx_supported == 2) {
1327        png_mmx_support();
1328    }
1330    if (row != NULL && row_info != NULL)
1331    {
1332       png_uint_32 final_width;
1334       final_width = row_info->width * png_pass_inc[pass];
1336       switch (row_info->pixel_depth)
1337       {
1338          case 1:
1339          {
1340             png_bytep sp, dp;
1341             int sshift, dshift;
1342             int s_start, s_end, s_inc;
1343             png_byte v;
1344             png_uint_32 i;
1345             int j;
1347             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1348             dp = row + (png_size_t)((final_width - 1) >> 3);
1349 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1350             if (transformations & PNG_PACKSWAP)
1351             {
1352                sshift = (int)((row_info->width + 7) & 7);
1353                dshift = (int)((final_width + 7) & 7);
1354                s_start = 7;
1355                s_end = 0;
1356                s_inc = -1;
1357             }
1358             else
1359 #endif
1360             {
1361                sshift = 7 - (int)((row_info->width + 7) & 7);
1362                dshift = 7 - (int)((final_width + 7) & 7);
1363                s_start = 0;
1364                s_end = 7;
1365                s_inc = 1;
1366             }
1368             for (i = row_info->width; i; i--)
1369             {
1370                v = (png_byte)((*sp >> sshift) & 0x1);
1371                for (j = 0; j < png_pass_inc[pass]; j++)
1372                {
1373                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1374                   *dp |= (png_byte)(v << dshift);
1375                   if (dshift == s_end)
1376                   {
1377                      dshift = s_start;
1378                      dp--;
1379                   }
1380                   else
1381                      dshift += s_inc;
1382                }
1383                if (sshift == s_end)
1384                {
1385                   sshift = s_start;
1386                   sp--;
1387                }
1388                else
1389                   sshift += s_inc;
1390             }
1391             break;
1392          }
1394          case 2:
1395          {
1396             png_bytep sp, dp;
1397             int sshift, dshift;
1398             int s_start, s_end, s_inc;
1399             png_uint_32 i;
1401             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1402             dp = row + (png_size_t)((final_width - 1) >> 2);
1403 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1404             if (transformations & PNG_PACKSWAP)
1405             {
1406                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1407                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1408                s_start = 6;
1409                s_end = 0;
1410                s_inc = -2;
1411             }
1412             else
1413 #endif
1414             {
1415                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1416                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1417                s_start = 0;
1418                s_end = 6;
1419                s_inc = 2;
1420             }
1422             for (i = row_info->width; i; i--)
1423             {
1424                png_byte v;
1425                int j;
1427                v = (png_byte)((*sp >> sshift) & 0x3);
1428                for (j = 0; j < png_pass_inc[pass]; j++)
1429                {
1430                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1431                   *dp |= (png_byte)(v << dshift);
1432                   if (dshift == s_end)
1433                   {
1434                      dshift = s_start;
1435                      dp--;
1436                   }
1437                   else
1438                      dshift += s_inc;
1439                }
1440                if (sshift == s_end)
1441                {
1442                   sshift = s_start;
1443                   sp--;
1444                }
1445                else
1446                   sshift += s_inc;
1447             }
1448             break;
1449          }
1451          case 4:
1452          {
1453             png_bytep sp, dp;
1454             int sshift, dshift;
1455             int s_start, s_end, s_inc;
1456             png_uint_32 i;
1458             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1459             dp = row + (png_size_t)((final_width - 1) >> 1);
1460 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1461             if (transformations & PNG_PACKSWAP)
1462             {
1463                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1464                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1465                s_start = 4;
1466                s_end = 0;
1467                s_inc = -4;
1468             }
1469             else
1470 #endif
1471             {
1472                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1473                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1474                s_start = 0;
1475                s_end = 4;
1476                s_inc = 4;
1477             }
1479             for (i = row_info->width; i; i--)
1480             {
1481                png_byte v;
1482                int j;
1484                v = (png_byte)((*sp >> sshift) & 0xf);
1485                for (j = 0; j < png_pass_inc[pass]; j++)
1486                {
1487                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1488                   *dp |= (png_byte)(v << dshift);
1489                   if (dshift == s_end)
1490                   {
1491                      dshift = s_start;
1492                      dp--;
1493                   }
1494                   else
1495                      dshift += s_inc;
1496                }
1497                if (sshift == s_end)
1498                {
1499                   sshift = s_start;
1500                   sp--;
1501                }
1502                else
1503                   sshift += s_inc;
1504             }
1505             break;
1506          }
1508          //====================================================================
1510          default:  // 8-bit or larger (this is where the routine is modified)
1511          {
1512 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1513 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1514 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1515 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1516             png_bytep sptr, dp;
1517             png_uint_32 i;
1518             png_size_t pixel_bytes;
1519             int width = row_info->width;
1521             pixel_bytes = (row_info->pixel_depth >> 3);
1523             // point sptr at the last pixel in the pre-expanded row:
1524             sptr = row + (width - 1) * pixel_bytes;
1526             // point dp at the last pixel position in the expanded row:
1527             dp = row + (final_width - 1) * pixel_bytes;
1529             // New code by Nirav Chhatrapati - Intel Corporation
1531             if ( _mmx_supported )
1532             {
1533                //--------------------------------------------------------------
1534                if (pixel_bytes == 3)
1535                {
1536                   if (((pass == 0) || (pass == 1)) && width)
1537                   {
1538                      int dummy_value_c;   // fix 'forbidden register spilled'
1539                      int dummy_value_S;
1540                      int dummy_value_D;
1542                      __asm__ __volatile__ (
1543                         "subl $21, %%edi         \n\t"
1544                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1546                      ".loop3_pass0:              \n\t"
1547                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1548                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1549                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1550                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1551                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1552                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1553                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1554                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1555                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1556                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1557                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1558                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1559                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1560                         "movq %%mm4, 16(%%edi)   \n\t"
1561                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1562                         "movq %%mm3, 8(%%edi)    \n\t"
1563                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1564                         "subl $3, %%esi          \n\t"
1565                         "movq %%mm0, (%%edi)     \n\t"
1566                         "subl $24, %%edi         \n\t"
1567                         "decl %%ecx              \n\t"
1568                         "jnz .loop3_pass0        \n\t"
1569                         "EMMS                    \n\t" // DONE
1571                         : "=c" (dummy_value_c),        // output regs (dummy)
1572                           "=S" (dummy_value_S),
1573                           "=D" (dummy_value_D)
1575                         : "1" (sptr),      // esi      // input regs
1576                           "2" (dp),        // edi
1577                           "0" (width)      // ecx
1578 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
1580 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1581                         : "%mm0", "%mm1", "%mm2"       // clobber list
1582                         , "%mm3", "%mm4"
1583 #endif
1584                      );
1585                   }
1586                   else if (((pass == 2) || (pass == 3)) && width)
1587                   {
1588                      int dummy_value_c;   // fix 'forbidden register spilled'
1589                      int dummy_value_S;
1590                      int dummy_value_D;
1592                      __asm__ __volatile__ (
1593                         "subl $9, %%edi          \n\t"
1594                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1596                      ".loop3_pass2:              \n\t"
1597                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1598                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1599                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1600                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1601                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1602                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1603                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1604                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1605                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1606                         "movq %%mm0, 4(%%edi)    \n\t"
1607                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1608                         "subl $3, %%esi          \n\t"
1609                         "movd %%mm0, (%%edi)     \n\t"
1610                         "subl $12, %%edi         \n\t"
1611                         "decl %%ecx              \n\t"
1612                         "jnz .loop3_pass2        \n\t"
1613                         "EMMS                    \n\t" // DONE
1615                         : "=c" (dummy_value_c),        // output regs (dummy)
1616                           "=S" (dummy_value_S),
1617                           "=D" (dummy_value_D)
1619                         : "1" (sptr),      // esi      // input regs
1620                           "2" (dp),        // edi
1621                           "0" (width)      // ecx
1623 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1624                         : "%mm0", "%mm1", "%mm2"       // clobber list
1625 #endif
1626                      );
1627                   }
1628                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1629                   {
1630                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1631                      if (width_mmx < 0)
1632                          width_mmx = 0;
1633                      width -= width_mmx;        // 8 or 9 pix (24 or 27 bytes), or the whole row if width < 8
1634                      if (width_mmx)
1635                      {
1636                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1637                         // sptr points at last pixel in pre-expanded row
1638                         // dp points at last pixel position in expanded row
1639                         int dummy_value_c;  // fix 'forbidden register spilled'
1640                         int dummy_value_S;
1641                         int dummy_value_D;
1643                         __asm__ __volatile__ (
1644                            "subl $3, %%esi          \n\t"
1645                            "subl $9, %%edi          \n\t"
1646                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1648                         ".loop3_pass4:              \n\t"
1649                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1650                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1651                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1652                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1653                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
1654                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1655                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1656                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1657                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1658                            "movq %%mm0, (%%edi)     \n\t"
1659                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1660                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
1661                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1662                            "subl $6, %%esi          \n\t"
1663                            "movd %%mm2, 8(%%edi)    \n\t"
1664                            "subl $12, %%edi         \n\t"
1665                            "subl $2, %%ecx          \n\t"
1666                            "jnz .loop3_pass4        \n\t"
1667                            "EMMS                    \n\t" // DONE
1669                            : "=c" (dummy_value_c),        // output regs (dummy)
1670                              "=S" (dummy_value_S),
1671                              "=D" (dummy_value_D)
1673                            : "1" (sptr),      // esi      // input regs
1674                              "2" (dp),        // edi
1675                              "0" (width_mmx)  // ecx
1677 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1678                            : "%mm0", "%mm1"               // clobber list
1679                            , "%mm2", "%mm3"
1680 #endif
1681                         );
1682                      }
1684                      sptr -= width_mmx*3;
1685                      dp -= width_mmx*6;
1686                      for (i = width; i; i--)
1687                      {
1688                         png_byte v[8];
1689                         int j;
1691                         png_memcpy(v, sptr, 3);
1692                         for (j = 0; j < png_pass_inc[pass]; j++)
1693                         {
1694                            png_memcpy(dp, v, 3);
1695                            dp -= 3;
1696                         }
1697                         sptr -= 3;
1698                      }
1699                   }
1700                } /* end of pixel_bytes == 3 */
1702                //--------------------------------------------------------------
1703                else if (pixel_bytes == 1)
1704                {
1705                   if (((pass == 0) || (pass == 1)) && width)
1706                   {
1707                      int width_mmx = ((width >> 2) << 2);
1708                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1709                      if (width_mmx)
1710                      {
1711                         int dummy_value_c;  // fix 'forbidden register spilled'
1712                         int dummy_value_S;
1713                         int dummy_value_D;
1715                         __asm__ __volatile__ (
1716                            "subl $3, %%esi          \n\t"
1717                            "subl $31, %%edi         \n\t"
1719                         ".loop1_pass0:              \n\t"
1720                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1721                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1722                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1723                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1724                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1725                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1726                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1727                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1728                            "movq %%mm0, (%%edi)     \n\t"
1729                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1730                            "movq %%mm3, 8(%%edi)    \n\t"
1731                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1732                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1733                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1734                            "movq %%mm2, 16(%%edi)   \n\t"
1735                            "subl $4, %%esi          \n\t"
1736                            "movq %%mm4, 24(%%edi)   \n\t"
1737                            "subl $32, %%edi         \n\t"
1738                            "subl $4, %%ecx          \n\t"
1739                            "jnz .loop1_pass0        \n\t"
1740                            "EMMS                    \n\t" // DONE
1742                            : "=c" (dummy_value_c),        // output regs (dummy)
1743                              "=S" (dummy_value_S),
1744                              "=D" (dummy_value_D)
1746                            : "1" (sptr),      // esi      // input regs
1747                              "2" (dp),        // edi
1748                              "0" (width_mmx)  // ecx
1750 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1751                            : "%mm0", "%mm1", "%mm2"       // clobber list
1752                            , "%mm3", "%mm4"
1753 #endif
1754                         );
1755                      }
1757                      sptr -= width_mmx;
1758                      dp -= width_mmx*8;
1759                      for (i = width; i; i--)
1760                      {
1761                         int j;
1763                        /* I simplified this part in version 1.0.4e
1764                         * here and in several other instances where
1765                         * pixel_bytes == 1  -- GR-P
1766                         *
1767                         * Original code:
1768                         *
1769                         * png_byte v[8];
1770                         * png_memcpy(v, sptr, pixel_bytes);
1771                         * for (j = 0; j < png_pass_inc[pass]; j++)
1772                         * {
1773                         *    png_memcpy(dp, v, pixel_bytes);
1774                         *    dp -= pixel_bytes;
1775                         * }
1776                         * sptr -= pixel_bytes;
1777                         *
1778                         * Replacement code is in the next three lines:
1779                         */
1781                         for (j = 0; j < png_pass_inc[pass]; j++)
1782                            *dp-- = *sptr;
1783                         --sptr;
1784                      }
1785                   }
1786                   else if (((pass == 2) || (pass == 3)) && width)
1787                   {
1788                      int width_mmx = ((width >> 2) << 2);
1789                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1790                      if (width_mmx)
1791                      {
1792                         int dummy_value_c;  // fix 'forbidden register spilled'
1793                         int dummy_value_S;
1794                         int dummy_value_D;
1796                         __asm__ __volatile__ (
1797                            "subl $3, %%esi          \n\t"
1798                            "subl $15, %%edi         \n\t"
1800                         ".loop1_pass2:              \n\t"
1801                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1802                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1803                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
1804                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1805                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
1806                            "movq %%mm0, (%%edi)     \n\t"
1807                            "subl $4, %%esi          \n\t"
1808                            "movq %%mm1, 8(%%edi)    \n\t"
1809                            "subl $16, %%edi         \n\t"
1810                            "subl $4, %%ecx          \n\t"
1811                            "jnz .loop1_pass2        \n\t"
1812                            "EMMS                    \n\t" // DONE
1814                            : "=c" (dummy_value_c),        // output regs (dummy)
1815                              "=S" (dummy_value_S),
1816                              "=D" (dummy_value_D)
1818                            : "1" (sptr),      // esi      // input regs
1819                              "2" (dp),        // edi
1820                              "0" (width_mmx)  // ecx
1822 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1823                            : "%mm0", "%mm1"               // clobber list
1824 #endif
1825                         );
1826                      }
1828                      sptr -= width_mmx;
1829                      dp -= width_mmx*4;
1830                      for (i = width; i; i--)
1831                      {
1832                         int j;
1834                         for (j = 0; j < png_pass_inc[pass]; j++)
1835                            *dp-- = *sptr;
1836                         --sptr;
1837                      }
1838                   }
1839                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
1840                   {
1841                      int width_mmx = ((width >> 3) << 3);
1842                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
1843                      if (width_mmx)
1844                      {
1845                         int dummy_value_c;  // fix 'forbidden register spilled'
1846                         int dummy_value_S;
1847                         int dummy_value_D;
1849                         __asm__ __volatile__ (
1850                            "subl $7, %%esi          \n\t"
1851                            "subl $15, %%edi         \n\t"
1853                         ".loop1_pass4:              \n\t"
1854                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
1855                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
1856                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1857                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
1858                            "movq %%mm1, 8(%%edi)    \n\t"
1859                            "subl $8, %%esi          \n\t"
1860                            "movq %%mm0, (%%edi)     \n\t"
1861                            "subl $16, %%edi         \n\t"
1862                            "subl $8, %%ecx          \n\t"
1863                            "jnz .loop1_pass4        \n\t"
1864                            "EMMS                    \n\t" // DONE
1866                            : "=c" (dummy_value_c),        // output regs (dummy)
1867                              "=S" (dummy_value_S),
1868                              "=D" (dummy_value_D)
1870                            : "1" (sptr),      // esi      // input regs
1871                              "2" (dp),        // edi
1872                              "0" (width_mmx)  // ecx
1874 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1875                            : "%mm0", "%mm1"               // clobber list
1876 #endif
1877                         );
1878                      }
1880                      sptr -= width_mmx;
1881                      dp -= width_mmx*2;
1882                      for (i = width; i; i--)
1883                      {
1884                         int j;
1886                         for (j = 0; j < png_pass_inc[pass]; j++)
1887                            *dp-- = *sptr;
1888                         --sptr;
1889                      }
1890                   }
1891                } /* end of pixel_bytes == 1 */
1893                //--------------------------------------------------------------
1894                else if (pixel_bytes == 2)
1895                {
1896                   if (((pass == 0) || (pass == 1)) && width)
1897                   {
1898                      int width_mmx = ((width >> 1) << 1);
1899                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
1900                      if (width_mmx)
1901                      {
1902                         int dummy_value_c;  // fix 'forbidden register spilled'
1903                         int dummy_value_S;
1904                         int dummy_value_D;
1906                         __asm__ __volatile__ (
1907                            "subl $2, %%esi          \n\t"
1908                            "subl $30, %%edi         \n\t"
1910                         ".loop2_pass0:              \n\t"
1911                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1912                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
1913                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
1914                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
1915                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
1916                            "movq %%mm0, (%%edi)     \n\t"
1917                            "movq %%mm0, 8(%%edi)    \n\t"
1918                            "movq %%mm1, 16(%%edi)   \n\t"
1919                            "subl $4, %%esi          \n\t"
1920                            "movq %%mm1, 24(%%edi)   \n\t"
1921                            "subl $32, %%edi         \n\t"
1922                            "subl $2, %%ecx          \n\t"
1923                            "jnz .loop2_pass0        \n\t"
1924                            "EMMS                    \n\t" // DONE
1926                            : "=c" (dummy_value_c),        // output regs (dummy)
1927                              "=S" (dummy_value_S),
1928                              "=D" (dummy_value_D)
1930                            : "1" (sptr),      // esi      // input regs
1931                              "2" (dp),        // edi
1932                              "0" (width_mmx)  // ecx
1934 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1935                            : "%mm0", "%mm1"               // clobber list
1936 #endif
1937                         );
1938                      }
1940                      sptr -= (width_mmx*2 - 2); // sign fixed
1941                      dp -= (width_mmx*16 - 2);  // sign fixed
1942                      for (i = width; i; i--)
1943                      {
1944                         png_byte v[8];
1945                         int j;
1946                         sptr -= 2;
1947                         png_memcpy(v, sptr, 2);
1948                         for (j = 0; j < png_pass_inc[pass]; j++)
1949                         {
1950                            dp -= 2;
1951                            png_memcpy(dp, v, 2);
1952                         }
1953                      }
1954                   }
1955                   else if (((pass == 2) || (pass == 3)) && width)
1956                   {
1957                      int width_mmx = ((width >> 1) << 1) ;
1958                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
1959                      if (width_mmx)
1960                      {
1961                         int dummy_value_c;  // fix 'forbidden register spilled'
1962                         int dummy_value_S;
1963                         int dummy_value_D;
1965                         __asm__ __volatile__ (
1966                            "subl $2, %%esi          \n\t"
1967                            "subl $14, %%edi         \n\t"
1969                         ".loop2_pass2:              \n\t"
1970                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1971                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
1972                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
1973                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
1974                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
1975                            "movq %%mm0, (%%edi)     \n\t"
1976                            "subl $4, %%esi          \n\t"
1977                            "movq %%mm1, 8(%%edi)    \n\t"
1978                            "subl $16, %%edi         \n\t"
1979                            "subl $2, %%ecx          \n\t"
1980                            "jnz .loop2_pass2        \n\t"
1981                            "EMMS                    \n\t" // DONE
1983                            : "=c" (dummy_value_c),        // output regs (dummy)
1984                              "=S" (dummy_value_S),
1985                              "=D" (dummy_value_D)
1987                            : "1" (sptr),      // esi      // input regs
1988                              "2" (dp),        // edi
1989                              "0" (width_mmx)  // ecx
1991 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992                            : "%mm0", "%mm1"               // clobber list
1993 #endif
1994                         );
1995                      }
1997                      sptr -= (width_mmx*2 - 2); // sign fixed
1998                      dp -= (width_mmx*8 - 2);   // sign fixed
1999                      for (i = width; i; i--)
2000                      {
2001                         png_byte v[8];
2002                         int j;
2003                         sptr -= 2;
2004                         png_memcpy(v, sptr, 2);
2005                         for (j = 0; j < png_pass_inc[pass]; j++)
2006                         {
2007                            dp -= 2;
2008                            png_memcpy(dp, v, 2);
2009                         }
2010                      }
2011                   }
2012                   else if (width)  // pass == 4 or 5
2013                   {
2014                      int width_mmx = ((width >> 1) << 1) ;
2015                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2016                      if (width_mmx)
2017                      {
2018                         int dummy_value_c;  // fix 'forbidden register spilled'
2019                         int dummy_value_S;
2020                         int dummy_value_D;
2022                         __asm__ __volatile__ (
2023                            "subl $2, %%esi          \n\t"
2024                            "subl $6, %%edi          \n\t"
2026                         ".loop2_pass4:              \n\t"
2027                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2028                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2029                            "subl $4, %%esi          \n\t"
2030                            "movq %%mm0, (%%edi)     \n\t"
2031                            "subl $8, %%edi          \n\t"
2032                            "subl $2, %%ecx          \n\t"
2033                            "jnz .loop2_pass4        \n\t"
2034                            "EMMS                    \n\t" // DONE
2036                            : "=c" (dummy_value_c),        // output regs (dummy)
2037                              "=S" (dummy_value_S),
2038                              "=D" (dummy_value_D)
2040                            : "1" (sptr),      // esi      // input regs
2041                              "2" (dp),        // edi
2042                              "0" (width_mmx)  // ecx
2044 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2045                            : "%mm0"                       // clobber list
2046 #endif
2047                         );
2048                      }
2050                      sptr -= (width_mmx*2 - 2); // sign fixed
2051                      dp -= (width_mmx*4 - 2);   // sign fixed
2052                      for (i = width; i; i--)
2053                      {
2054                         png_byte v[8];
2055                         int j;
2056                         sptr -= 2;
2057                         png_memcpy(v, sptr, 2);
2058                         for (j = 0; j < png_pass_inc[pass]; j++)
2059                         {
2060                            dp -= 2;
2061                            png_memcpy(dp, v, 2);
2062                         }
2063                      }
2064                   }
2065                } /* end of pixel_bytes == 2 */
2067                //--------------------------------------------------------------
2068                else if (pixel_bytes == 4)
2069                {
2070                   if (((pass == 0) || (pass == 1)) && width)
2071                   {
2072                      int width_mmx = ((width >> 1) << 1);
2073                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2074                      if (width_mmx)
2075                      {
2076                         int dummy_value_c;  // fix 'forbidden register spilled'
2077                         int dummy_value_S;
2078                         int dummy_value_D;
2080                         __asm__ __volatile__ (
2081                            "subl $4, %%esi          \n\t"
2082                            "subl $60, %%edi         \n\t"
2084                         ".loop4_pass0:              \n\t"
2085                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2086                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2087                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2088                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2089                            "movq %%mm0, (%%edi)     \n\t"
2090                            "movq %%mm0, 8(%%edi)    \n\t"
2091                            "movq %%mm0, 16(%%edi)   \n\t"
2092                            "movq %%mm0, 24(%%edi)   \n\t"
2093                            "movq %%mm1, 32(%%edi)   \n\t"
2094                            "movq %%mm1, 40(%%edi)   \n\t"
2095                            "movq %%mm1, 48(%%edi)   \n\t"
2096                            "subl $8, %%esi          \n\t"
2097                            "movq %%mm1, 56(%%edi)   \n\t"
2098                            "subl $64, %%edi         \n\t"
2099                            "subl $2, %%ecx          \n\t"
2100                            "jnz .loop4_pass0        \n\t"
2101                            "EMMS                    \n\t" // DONE
2103                            : "=c" (dummy_value_c),        // output regs (dummy)
2104                              "=S" (dummy_value_S),
2105                              "=D" (dummy_value_D)
2107                            : "1" (sptr),      // esi      // input regs
2108                              "2" (dp),        // edi
2109                              "0" (width_mmx)  // ecx
2111 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2112                            : "%mm0", "%mm1"               // clobber list
2113 #endif
2114                         );
2115                      }
2117                      sptr -= (width_mmx*4 - 4); // sign fixed
2118                      dp -= (width_mmx*32 - 4);  // sign fixed
2119                      for (i = width; i; i--)
2120                      {
2121                         png_byte v[8];
2122                         int j;
2123                         sptr -= 4;
2124                         png_memcpy(v, sptr, 4);
2125                         for (j = 0; j < png_pass_inc[pass]; j++)
2126                         {
2127                            dp -= 4;
2128                            png_memcpy(dp, v, 4);
2129                         }
2130                      }
2131                   }
2132                   else if (((pass == 2) || (pass == 3)) && width)
2133                   {
2134                      int width_mmx = ((width >> 1) << 1);
2135                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2136                      if (width_mmx)
2137                      {
2138                         int dummy_value_c;  // fix 'forbidden register spilled'
2139                         int dummy_value_S;
2140                         int dummy_value_D;
2142                         __asm__ __volatile__ (
2143                            "subl $4, %%esi          \n\t"
2144                            "subl $28, %%edi         \n\t"
2146                         ".loop4_pass2:              \n\t"
2147                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2148                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2149                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2150                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2151                            "movq %%mm0, (%%edi)     \n\t"
2152                            "movq %%mm0, 8(%%edi)    \n\t"
2153                            "movq %%mm1, 16(%%edi)   \n\t"
2154                            "movq %%mm1, 24(%%edi)   \n\t"
2155                            "subl $8, %%esi          \n\t"
2156                            "subl $32, %%edi         \n\t"
2157                            "subl $2, %%ecx          \n\t"
2158                            "jnz .loop4_pass2        \n\t"
2159                            "EMMS                    \n\t" // DONE
2161                            : "=c" (dummy_value_c),        // output regs (dummy)
2162                              "=S" (dummy_value_S),
2163                              "=D" (dummy_value_D)
2165                            : "1" (sptr),      // esi      // input regs
2166                              "2" (dp),        // edi
2167                              "0" (width_mmx)  // ecx
2169 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2170                            : "%mm0", "%mm1"               // clobber list
2171 #endif
2172                         );
2173                      }
2175                      sptr -= (width_mmx*4 - 4); // sign fixed
2176                      dp -= (width_mmx*16 - 4);  // sign fixed
2177                      for (i = width; i; i--)
2178                      {
2179                         png_byte v[8];
2180                         int j;
2181                         sptr -= 4;
2182                         png_memcpy(v, sptr, 4);
2183                         for (j = 0; j < png_pass_inc[pass]; j++)
2184                         {
2185                            dp -= 4;
2186                            png_memcpy(dp, v, 4);
2187                         }
2188                      }
2189                   }
2190                   else if (width)  // pass == 4 or 5
2191                   {
2192                      int width_mmx = ((width >> 1) << 1) ;
2193                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2194                      if (width_mmx)
2195                      {
2196                         int dummy_value_c;  // fix 'forbidden register spilled'
2197                         int dummy_value_S;
2198                         int dummy_value_D;
2200                         __asm__ __volatile__ (
2201                            "subl $4, %%esi          \n\t"
2202                            "subl $12, %%edi         \n\t"
2204                         ".loop4_pass4:              \n\t"
2205                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2206                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2207                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2208                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2209                            "movq %%mm0, (%%edi)     \n\t"
2210                            "subl $8, %%esi          \n\t"
2211                            "movq %%mm1, 8(%%edi)    \n\t"
2212                            "subl $16, %%edi         \n\t"
2213                            "subl $2, %%ecx          \n\t"
2214                            "jnz .loop4_pass4        \n\t"
2215                            "EMMS                    \n\t" // DONE
2217                            : "=c" (dummy_value_c),        // output regs (dummy)
2218                              "=S" (dummy_value_S),
2219                              "=D" (dummy_value_D)
2221                            : "1" (sptr),      // esi      // input regs
2222                              "2" (dp),        // edi
2223                              "0" (width_mmx)  // ecx
2225 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2226                            : "%mm0", "%mm1"               // clobber list
2227 #endif
2228                         );
2229                      }
2231                      sptr -= (width_mmx*4 - 4); // sign fixed
2232                      dp -= (width_mmx*8 - 4);   // sign fixed
2233                      for (i = width; i; i--)
2234                      {
2235                         png_byte v[8];
2236                         int j;
2237                         sptr -= 4;
2238                         png_memcpy(v, sptr, 4);
2239                         for (j = 0; j < png_pass_inc[pass]; j++)
2240                         {
2241                            dp -= 4;
2242                            png_memcpy(dp, v, 4);
2243                         }
2244                      }
2245                   }
2246                } /* end of pixel_bytes == 4 */
2248                //--------------------------------------------------------------
2249                else if (pixel_bytes == 8)
2250                {
2251 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2252                   // GRR NOTE:  no need to combine passes here!
2253                   if (((pass == 0) || (pass == 1)) && width)
2254                   {
2255                      int dummy_value_c;  // fix 'forbidden register spilled'
2256                      int dummy_value_S;
2257                      int dummy_value_D;
2259                      // source is 8-byte RRGGBBAA
2260                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2261                      __asm__ __volatile__ (
2262                         "subl $56, %%edi         \n\t" // start of last block
2264                      ".loop8_pass0:              \n\t"
2265                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2266                         "movq %%mm0, (%%edi)     \n\t"
2267                         "movq %%mm0, 8(%%edi)    \n\t"
2268                         "movq %%mm0, 16(%%edi)   \n\t"
2269                         "movq %%mm0, 24(%%edi)   \n\t"
2270                         "movq %%mm0, 32(%%edi)   \n\t"
2271                         "movq %%mm0, 40(%%edi)   \n\t"
2272                         "movq %%mm0, 48(%%edi)   \n\t"
2273                         "subl $8, %%esi          \n\t"
2274                         "movq %%mm0, 56(%%edi)   \n\t"
2275                         "subl $64, %%edi         \n\t"
2276                         "decl %%ecx              \n\t"
2277                         "jnz .loop8_pass0        \n\t"
2278                         "EMMS                    \n\t" // DONE
2280                         : "=c" (dummy_value_c),        // output regs (dummy)
2281                           "=S" (dummy_value_S),
2282                           "=D" (dummy_value_D)
2284                         : "1" (sptr),      // esi      // input regs
2285                           "2" (dp),        // edi
2286                           "0" (width)      // ecx
2288 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2289                         : "%mm0"                       // clobber list
2290 #endif
2291                      );
2292                   }
2293                   else if (((pass == 2) || (pass == 3)) && width)
2294                   {
2295                      // source is 8-byte RRGGBBAA
2296                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2297                      int width_mmx = ((width >> 1) << 1) ;
2298                      width -= width_mmx;
2299                      if (width_mmx)
2300                      {
2301                         int dummy_value_c;  // fix 'forbidden register spilled'
2302                         int dummy_value_S;
2303                         int dummy_value_D;
2305                         __asm__ __volatile__ (
2306                            "subl $24, %%edi         \n\t" // start of last block
2308                         ".loop8_pass2:              \n\t"
2309                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2310                            "movq %%mm0, (%%edi)     \n\t"
2311                            "movq %%mm0, 8(%%edi)    \n\t"
2312                            "movq %%mm0, 16(%%edi)   \n\t"
2313                            "subl $8, %%esi          \n\t"
2314                            "movq %%mm0, 24(%%edi)   \n\t"
2315                            "subl $32, %%edi         \n\t"
2316                            "decl %%ecx              \n\t"
2317                            "jnz .loop8_pass2        \n\t"
2318                            "EMMS                    \n\t" // DONE
2320                            : "=c" (dummy_value_c),        // output regs (dummy)
2321                              "=S" (dummy_value_S),
2322                              "=D" (dummy_value_D)
2324                            : "1" (sptr),      // esi      // input regs
2325                              "2" (dp),        // edi
2326                              "0" (width)      // ecx
2328 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2329                            : "%mm0"                       // clobber list
2330 #endif
2331                         );
2332                      }
2333                   }
2334                   else if (width)  // pass == 4 or 5
2335                   {
2336                      // source is 8-byte RRGGBBAA
2337                      // dest is 16-byte RRGGBBAA RRGGBBAA
2338                      int width_mmx = ((width >> 1) << 1) ;
2339                      width -= width_mmx;
2340                      if (width_mmx)
2341                      {
2342                         int dummy_value_c;  // fix 'forbidden register spilled'
2343                         int dummy_value_S;
2344                         int dummy_value_D;
2346                         __asm__ __volatile__ (
2347                            "subl $8, %%edi          \n\t" // start of last block
2349                         ".loop8_pass4:              \n\t"
2350                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2351                            "movq %%mm0, (%%edi)     \n\t"
2352                            "subl $8, %%esi          \n\t"
2353                            "movq %%mm0, 8(%%edi)    \n\t"
2354                            "subl $16, %%edi         \n\t"
2355                            "decl %%ecx              \n\t"
2356                            "jnz .loop8_pass4        \n\t"
2357                            "EMMS                    \n\t" // DONE
2359                            : "=c" (dummy_value_c),        // output regs (dummy)
2360                              "=S" (dummy_value_S),
2361                              "=D" (dummy_value_D)
2363                            : "1" (sptr),      // esi      // input regs
2364                              "2" (dp),        // edi
2365                              "0" (width)      // ecx
2367 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2368                            : "%mm0"                       // clobber list
2369 #endif
2370                         );
2371                      }
2372                   }
2374                } /* end of pixel_bytes == 8 */
2376                //--------------------------------------------------------------
2377                else if (pixel_bytes == 6)
2378                {
2379                   for (i = width; i; i--)
2380                   {
2381                      png_byte v[8];
2382                      int j;
2383                      png_memcpy(v, sptr, 6);
2384                      for (j = 0; j < png_pass_inc[pass]; j++)
2385                      {
2386                         png_memcpy(dp, v, 6);
2387                         dp -= 6;
2388                      }
2389                      sptr -= 6;
2390                   }
2391                } /* end of pixel_bytes == 6 */
2393                //--------------------------------------------------------------
2394                else
2395                {
2396                   for (i = width; i; i--)
2397                   {
2398                      png_byte v[8];
2399                      int j;
2400                      png_memcpy(v, sptr, pixel_bytes);
2401                      for (j = 0; j < png_pass_inc[pass]; j++)
2402                      {
2403                         png_memcpy(dp, v, pixel_bytes);
2404                         dp -= pixel_bytes;
2405                      }
2406                      sptr-= pixel_bytes;
2407                   }
2408                }
2409             } // end of _mmx_supported ========================================
2411             else /* MMX not supported:  use modified C code - takes advantage
2412                   *   of inlining of memcpy for a constant */
2413                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2414                   *   block be replaced with immediate value (e.g., 1)? */
2415                  /* GRR 19991017:  replaced with constants in each case */
2416             {
2417                if (pixel_bytes == 1)
2418                {
2419                   for (i = width; i; i--)
2420                   {
2421                      int j;
2422                      for (j = 0; j < png_pass_inc[pass]; j++)
2423                         *dp-- = *sptr;
2424                      --sptr;
2425                   }
2426                }
2427                else if (pixel_bytes == 3)
2428                {
2429                   for (i = width; i; i--)
2430                   {
2431                      png_byte v[8];
2432                      int j;
2433                      png_memcpy(v, sptr, 3);
2434                      for (j = 0; j < png_pass_inc[pass]; j++)
2435                      {
2436                         png_memcpy(dp, v, 3);
2437                         dp -= 3;
2438                      }
2439                      sptr -= 3;
2440                   }
2441                }
2442                else if (pixel_bytes == 2)
2443                {
2444                   for (i = width; i; i--)
2445                   {
2446                      png_byte v[8];
2447                      int j;
2448                      png_memcpy(v, sptr, 2);
2449                      for (j = 0; j < png_pass_inc[pass]; j++)
2450                      {
2451                         png_memcpy(dp, v, 2);
2452                         dp -= 2;
2453                      }
2454                      sptr -= 2;
2455                   }
2456                }
2457                else if (pixel_bytes == 4)
2458                {
2459                   for (i = width; i; i--)
2460                   {
2461                      png_byte v[8];
2462                      int j;
2463                      png_memcpy(v, sptr, 4);
2464                      for (j = 0; j < png_pass_inc[pass]; j++)
2465                      {
2466                         png_memcpy(dp, v, 4);
2467                         dp -= 4;
2468                      }
2469                      sptr -= 4;
2470                   }
2471                }
2472                else if (pixel_bytes == 6)
2473                {
2474                   for (i = width; i; i--)
2475                   {
2476                      png_byte v[8];
2477                      int j;
2478                      png_memcpy(v, sptr, 6);
2479                      for (j = 0; j < png_pass_inc[pass]; j++)
2480                      {
2481                         png_memcpy(dp, v, 6);
2482                         dp -= 6;
2483                      }
2484                      sptr -= 6;
2485                   }
2486                }
2487                else if (pixel_bytes == 8)
2488                {
2489                   for (i = width; i; i--)
2490                   {
2491                      png_byte v[8];
2492                      int j;
2493                      png_memcpy(v, sptr, 8);
2494                      for (j = 0; j < png_pass_inc[pass]; j++)
2495                      {
2496                         png_memcpy(dp, v, 8);
2497                         dp -= 8;
2498                      }
2499                      sptr -= 8;
2500                   }
2501                }
2502                else     // GRR:  should never be reached
2503                {
2504                   for (i = width; i; i--)
2505                   {
2506                      png_byte v[8];
2507                      int j;
2508                      png_memcpy(v, sptr, pixel_bytes);
2509                      for (j = 0; j < png_pass_inc[pass]; j++)
2510                      {
2511                         png_memcpy(dp, v, pixel_bytes);
2512                         dp -= pixel_bytes;
2513                      }
2514                      sptr -= pixel_bytes;
2515                   }
2516                }
2518             } /* end if (MMX not supported) */
2519             break;
2520          }
2521       } /* end switch (row_info->pixel_depth) */
2523       row_info->width = final_width;
2524       row_info->rowbytes = ((final_width *
2525          (png_uint_32)row_info->pixel_depth + 7) >> 3);
2526    }
2528 } /* end png_do_read_interlace() */
2530 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2531 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2536 // These variables are utilized in the functions below.  They are declared
2537 // globally here to ensure alignment on 8-byte boundaries.
     //
     // Each object is one 64-bit value.  The inline assembly below refers to
     // them directly by symbol name (e.g. "movq _ActiveMask, %%mm7"), so the
     // identifiers here must stay in sync with the names used in the asm
     // strings.  The C code fills in the per-bpp values through the .use member
     // before entering each MMX loop.
2539 union uAll {
2540    long long use;     // 64-bit integer view; masks and shift counts are assigned through this member
2541    double  align;     // never accessed in the code visible here; presumably present only for the alignment noted above
2542 } _LBCarryMask = {0x0101010101010101LL},   // 0x01 in every byte: picks out the lsb of each byte (the averaging carry)
2543   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},   // 0x7f in every byte: clears bit 7 of each byte after a 64-bit psrlq $1
2544   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;   // set per bpp at run time: _ActiveMask selects the active byte group, _ShiftBpp = bpp*8 bits, _ShiftRem = 64 - _ShiftBpp; _ActiveMask2 and _ActiveMaskEnd are not referenced in this part of the file
2549 //===========================================================================//
2550 //                                                                           //
2551 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2552 //                                                                           //
2553 //===========================================================================//
2555 // Optimized code for PNG Average filter decoder
2557 static void /* PRIVATE */
2558 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2559                             png_bytep prev_row)
2561    int bpp;
2562    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2563    int dummy_value_S;
2564    int dummy_value_D;
2566    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2567    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2569    __asm__ __volatile__ (
2570       // initialize address pointers and offset
2571 #ifdef __PIC__
2572       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2573 #endif
2574 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2575       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2576       "movl %%edi, %%edx           \n\t"
2577 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2578 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2579       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2581       "xorl %%eax,%%eax            \n\t"
2583       // Compute the Raw value for the first bpp bytes
2584       //    Raw(x) = Avg(x) + (Prior(x)/2)
2585    "avg_rlp:                       \n\t"
2586       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2587       "incl %%ebx                  \n\t"
2588       "shrb %%al                   \n\t" // divide by 2
2589       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2590 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2591       "cmpl %%ecx, %%ebx           \n\t"
2592       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2593       "jb avg_rlp                  \n\t" // mov does not affect flags
2595       // get # of bytes to alignment
2596       "movl %%edi, _dif            \n\t" // take start of row
2597       "addl %%ebx, _dif            \n\t" // add bpp
2598       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2599       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2600       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2601       "jz avg_go                   \n\t" //  alignment
2603       // fix alignment
2604       // Compute the Raw value for the bytes up to the alignment boundary
2605       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2606       "xorl %%ecx, %%ecx           \n\t"
2608    "avg_lp1:                       \n\t"
2609       "xorl %%eax, %%eax           \n\t"
2610       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2611       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2612       "addw %%cx, %%ax             \n\t"
2613       "incl %%ebx                  \n\t"
2614       "shrw %%ax                   \n\t" // divide by 2
2615       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2616       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2617       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2618       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2620    "avg_go:                        \n\t"
2621       "movl _FullLength, %%eax     \n\t"
2622       "movl %%eax, %%ecx           \n\t"
2623       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2624       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2625       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2626       "movl %%ecx, _MMXLength      \n\t"
2627 #ifdef __PIC__
2628       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2629 #endif
2631       : "=c" (dummy_value_c),            // output regs (dummy)
2632         "=S" (dummy_value_S),
2633         "=D" (dummy_value_D)
2635       : "0" (bpp),       // ecx          // input regs
2636         "1" (prev_row),  // esi
2637         "2" (row)        // edi
2639       : "%eax", "%edx"                   // clobber list
2640 #ifndef __PIC__
2641       , "%ebx"
2642 #endif
2643       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2644       // (seems to work fine without...)
2645    );
2647    // now do the math for the rest of the row
2648    switch (bpp)
2649    {
2650       case 3:
2651       {
2652          _ActiveMask.use  = 0x0000000000ffffffLL;
2653          _ShiftBpp.use = 24;    // == 3 * 8
2654          _ShiftRem.use = 40;    // == 64 - 24
2656          __asm__ __volatile__ (
2657             // re-init address pointers and offset
2658             "movq _ActiveMask, %%mm7      \n\t"
2659             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2660             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2661 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2662             "movq _HBClearMask, %%mm4     \n\t"
2663 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2665             // prime the pump:  load the first Raw(x-bpp) data set
2666             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2667                                                 // (correct pos. in loop below)
2668          "avg_3lp:                        \n\t"
2669             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2670             "movq %%mm5, %%mm3            \n\t"
2671             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp) data
2672             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2673             "movq %%mm7, %%mm6            \n\t"
2674             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2675             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2676             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each byte
2677             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for each byte
2678             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2679             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2680             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2681                                // lsb's were == 1 (only valid for active group)
2682             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2683             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2684             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2685             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1 bytes to add to Avg
2686             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2687                                //  byte
2688             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2689             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover bytes 3-5
2690             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2691             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2692             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2693             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2694                                // lsb's were == 1 (only valid for active group)
2695             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2696             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2697             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2698             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2 bytes to add to Avg
2699             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2700                                //  byte
2702             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2703             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last two
2704                                  // bytes
2705             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2706             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2707                               // Data only needs to be shifted once here to
2708                               // get the correct x-bpp offset.
2709             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2710             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2711                               // lsb's were == 1 (only valid for active group)
2712             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2713             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2714             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2715             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2 bytes to add to Avg
2716             "addl $8, %%ecx               \n\t"
2717             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2718                                                 // byte
2719             // now ready to write back to memory
2720             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2721             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2722             "cmpl _MMXLength, %%ecx       \n\t"
2723             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2724             "jb avg_3lp                   \n\t"
2726             : "=S" (dummy_value_S),             // output regs (dummy)
2727               "=D" (dummy_value_D)
2729             : "0" (prev_row),  // esi           // input regs
2730               "1" (row)        // edi
2732             : "%ecx"                            // clobber list
2733 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2734             , "%mm0", "%mm1", "%mm2", "%mm3"
2735             , "%mm4", "%mm5", "%mm6", "%mm7"
2736 #endif
2737          );
2738       }
2739       break;  // end 3 bpp
2741       case 6:
2742       case 4:
2743       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2744       //case 5:   // GRR BOGUS
2745       {
         // Avg defilter for 4 or 6 bytes/pixel, 8 row bytes per loop pass:
         //    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
         // The per-byte floor is built as (a>>1) + (b>>1) + (a & b & 1).
         // Each 8-byte chunk is split into two "active groups":  the first bpp
         // bytes take Raw(x-bpp) from the previous chunk, the remaining 8-bpp
         // bytes take it from the bytes just decoded in this chunk.
         // Registers:  esi = prev_row, edi = row (preloaded through the asm
         // constraints), ecx = byte offset, running from _dif to _MMXLength.
2746          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2747                                                   // appropriate inactive bytes
2748          _ShiftBpp.use = bpp << 3;                // bpp in bits
2749          _ShiftRem.use = 64 - _ShiftBpp.use;      // remaining bits of a qword
2751          __asm__ __volatile__ (
2752             "movq _HBClearMask, %%mm4    \n\t"    // mm4:  mask used below to clear bit 7 of each byte
2754             // re-init address pointers and offset
2755             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
2757             // load _ActiveMask and clear all bytes except for 1st active group
2758             "movq _ActiveMask, %%mm7     \n\t"
2759 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2760             "psrlq _ShiftRem, %%mm7      \n\t" // mm7:  ones in the low bpp bytes only
2761 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2762             "movq %%mm7, %%mm6           \n\t"
2763             "movq _LBCarryMask, %%mm5    \n\t" // mm5:  mask used below to isolate each byte's lsb
2764             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active group
2766             // prime the pump:  load the first Raw(x-bpp) data set
2767             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2768                                           // (we correct pos. in loop below)
2769          "avg_4lp:                       \n\t"
2770             "movq (%%edi,%%ecx,), %%mm0  \n\t" // mm0:  Avg(x) for this chunk
2771             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
2772             "movq (%%esi,%%ecx,), %%mm1  \n\t" // mm1:  Prior(x) for this chunk
2773             // add (Prev_row/2) to average
2774             "movq %%mm5, %%mm3           \n\t"
2775             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2776             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2777             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
2778             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
2779             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2780             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2781             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2782                               // lsb's were == 1 (only valid for active group)
2783             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2784             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2785             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2786             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
2787             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2788                               // byte
2789             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2790             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2791             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2792             "addl $8, %%ecx              \n\t" // advance early; the store below compensates with -8
2793             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2794             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2795                               // lsb's were == 1 (only valid for active group)
2796             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2797             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2798             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2799             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
2800             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2801                               // byte
2802             "cmpl _MMXLength, %%ecx      \n\t" // flags consumed by the jb below; the two
                                                   // movq in between do not modify flags
2803             // now ready to write back to memory
2804             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2805             // prep Raw(x-bpp) for next loop
2806             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2807             "jb avg_4lp                  \n\t" // loop while ecx < _MMXLength (unsigned)
2809             : "=S" (dummy_value_S),            // output regs (dummy)
2810               "=D" (dummy_value_D)
2812             : "0" (prev_row),  // esi          // input regs
2813               "1" (row)        // edi
2815             : "%ecx"                           // clobber list
2816 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2817             , "%mm0", "%mm1", "%mm2", "%mm3"
2818             , "%mm4", "%mm5", "%mm6", "%mm7"
2819 #endif
2820          );
2821       }
2822       break;  // end 4,6 bpp
2824       case 2:
2825       {
         // Avg defilter for 2 bytes/pixel, 8 row bytes (4 pixels) per loop pass:
         //    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
         // Every pixel depends on the decoded pixel to its left, so the chunk
         // is resolved as four 2-byte "active groups" in sequence; the group
         // mask in mm6 is shifted left by 16 bits before each later group.
         // Registers:  esi = prev_row, edi = row (preloaded through the asm
         // constraints), ecx = byte offset, running from _dif to _MMXLength.
2826          _ActiveMask.use  = 0x000000000000ffffLL;
2827          _ShiftBpp.use = 16;   // == 2 * 8
2828          _ShiftRem.use = 48;   // == 64 - 16
2830          __asm__ __volatile__ (
2831             // load _ActiveMask
2832             "movq _ActiveMask, %%mm7     \n\t"
2833             // re-init address pointers and offset
2834             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
2835             "movq _LBCarryMask, %%mm5    \n\t" // mm5:  mask used below to isolate each byte's lsb
2836 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2837             "movq _HBClearMask, %%mm4    \n\t" // mm4:  mask used below to clear bit 7 of each byte
2838 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2840             // prime the pump:  load the first Raw(x-bpp) data set
2841             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2842                               // (we correct pos. in loop below)
2843          "avg_2lp:                       \n\t"
2844             "movq (%%edi,%%ecx,), %%mm0  \n\t" // mm0:  Avg(x) for this chunk
2845             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
2846             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX on the psrlq above:  was psllq)
2847             // add (Prev_row/2) to average
2848             "movq %%mm5, %%mm3           \n\t"
2849             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2850             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2851             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
2852             "movq %%mm7, %%mm6           \n\t" // mm6:  working group mask, reset to group 1
2853             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
2855             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2856             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2857             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2858                                                // lsb's were == 1 (only valid for active group)
2859             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2860             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2861             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2862             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
2863             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2865             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2866             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 2 & 3
2867             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2868             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2869             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2870             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2871                                                // lsb's were == 1 (only valid for active group)
2872             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2873             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2874             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2875             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
2876             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2878             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2879             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 4 & 5
2880             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2881             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2882             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2883             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2884                                                // lsb's were == 1 (only valid for active group)
2885             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2886             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2887             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2888             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3 bytes to add to Avg
2889             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2891             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
2892             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 6 & 7
2893             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2894             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2895             "addl $8, %%ecx              \n\t" // advance early; the store below compensates with -8
2896             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2897             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2898                                                // lsb's were == 1 (only valid for active group)
2899             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2900             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2901             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2902             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4 bytes to add to Avg
2903             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2905             "cmpl _MMXLength, %%ecx      \n\t" // flags consumed by the jb below; the two
                                                   // movq in between do not modify flags
2906             // now ready to write back to memory
2907             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2908             // prep Raw(x-bpp) for next loop
2909             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2910             "jb avg_2lp                  \n\t" // loop while ecx < _MMXLength (unsigned)
2912             : "=S" (dummy_value_S),            // output regs (dummy)
2913               "=D" (dummy_value_D)
2915             : "0" (prev_row),  // esi          // input regs
2916               "1" (row)        // edi
2918             : "%ecx"                           // clobber list
2919 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2920             , "%mm0", "%mm1", "%mm2", "%mm3"
2921             , "%mm4", "%mm5", "%mm6", "%mm7"
2922 #endif
2923          );
2924       }
2925       break;  // end 2 bpp
2927       case 1:
2928       {
         // Avg defilter for 1 byte/pixel, done with plain integer code (this
         // asm statement issues no MMX instructions).  One byte per loop pass,
         // from offset _dif up to _FullLength:
         //    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
         // Registers:  ecx = bpp on entry (then reused as scratch), esi =
         // prev_row, edi = row (all preloaded through the asm constraints),
         // ebx = byte offset x, edx = row - bpp.
2929          __asm__ __volatile__ (
2930             // re-init address pointers and offset
2931 #ifdef __PIC__
2932             "pushl %%ebx                 \n\t" // save Global Offset Table index
2933 #endif
2934             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
2935 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2936             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
2937             "jnb avg_1end                \n\t" // nothing left to decode
2938             // do Avg decode for remaining bytes
2939 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2940             "movl %%edi, %%edx           \n\t"
2941 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2942             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2943             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
2944                                                //  in loop below
2945          "avg_1lp:                       \n\t"
2946             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2947             "xorl %%eax, %%eax           \n\t"
2948             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2949             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2950             "addw %%cx, %%ax             \n\t" // 16-bit add keeps the 9-bit sum intact
2951             "incl %%ebx                  \n\t"
2952             "shrw %%ax                   \n\t" // divide by 2
2953             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2954             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
2955             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
2956                          // mov does not affect flags; -1 to offset inc ebx
2957             "jb avg_1lp                  \n\t" // loop while ebx < _FullLength (unsigned)
2959          "avg_1end:                      \n\t"
2960 #ifdef __PIC__
2961             "popl %%ebx                  \n\t" // Global Offset Table index
2962 #endif
2964             : "=c" (dummy_value_c),            // output regs (dummy)
2965               "=S" (dummy_value_S),
2966               "=D" (dummy_value_D)
2968             : "0" (bpp),       // ecx          // input regs
2969               "1" (prev_row),  // esi
2970               "2" (row)        // edi
2972             : "%eax", "%edx"                   // clobber list
2973 #ifndef __PIC__
2974             , "%ebx"
2975 #endif
2976          );
2977       }
      // return (not break):  the loop above already ran to _FullLength, so
      // the clean-up asm that follows the switch is deliberately bypassed
2978       return;  // end 1 bpp
2980       case 8:
2981       {
         // Avg defilter for 8 bytes/pixel, one whole pixel (8 bytes) per pass:
         //    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
         // With bpp == 8 the previous decoded qword is exactly Raw(x-bpp), so
         // no shifting or active-group masking is required.
         // Registers:  esi = prev_row, edi = row (preloaded through the asm
         // constraints), ecx = byte offset, running from _dif to _MMXLength.
2982          __asm__ __volatile__ (
2983             // re-init address pointers and offset
2984             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
2985             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
2986 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2987             "movq _HBClearMask, %%mm4    \n\t" // mm4:  mask used below to clear bit 7 of each byte
2988 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2990             // prime the pump:  load the first Raw(x-bpp) data set
2991             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2992                                       // (NO NEED to correct pos. in loop below)
2994          "avg_8lp:                       \n\t"
2995             "movq (%%edi,%%ecx,), %%mm0  \n\t" // mm0:  Avg(x)
2996             "movq %%mm5, %%mm3           \n\t"
2997             "movq (%%esi,%%ecx,), %%mm1  \n\t" // mm1:  Prior(x)
2998             "addl $8, %%ecx              \n\t" // advance early; the store below compensates with -8
2999             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3000             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3001             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3002                                                //  where both lsb's were == 1
3003             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3004             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3005             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3006             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3007             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each byte
3008             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each byte
3009             "cmpl _MMXLength, %%ecx      \n\t" // flags consumed by the jb below
3010             "movq %%mm0, -8(%%edi,%%ecx,) \n\t" // write back Raw(x); movq leaves flags alone
3011             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3012             "jb avg_8lp                  \n\t" // loop while ecx < _MMXLength (unsigned)
3014             : "=S" (dummy_value_S),            // output regs (dummy)
3015               "=D" (dummy_value_D)
3017             : "0" (prev_row),  // esi          // input regs
3018               "1" (row)        // edi
3020             : "%ecx"                           // clobber list
3021 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3022             , "%mm0", "%mm1", "%mm2"
3023             , "%mm3", "%mm4", "%mm5"
3024 #endif
3025          );
3026       }
3027       break;  // end 8 bpp
3029       default:                  // any bpp without an explicit case above:
                                // bpp > 8, plus 5 and 7 (their case labels
                                // are commented out in the 4/6 arm)
3030       {
3032          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
         // Only reports the error; no decoding is done for this bpp here.
         // NOTE(review):  control then falls to the clean-up asm after the
         // switch, which starts at offset _MMXLength -- so bytes between _dif
         // and _MMXLength would stay undecoded if this arm were ever taken.
3033          fprintf(stderr,
3034            "libpng:  internal logic error (png_read_filter_row_mmx_avg())\n");
         // The block below is compiled out.  Its operand lists are unfinished
         // (see the FIXASM/CHECKASM markers), and it still refers to row,
         // prev_row and bpp by name inside the asm template.
3036 #if 0
3037         __asm__ __volatile__ (
3038             "movq _LBCarryMask, %%mm5    \n\t"
3039             // re-init address pointers and offset
3040             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
3041             "movl row, %%edi             \n\t" // edi:  Avg(x)
3042             "movq _HBClearMask, %%mm4    \n\t"
3043             "movl %%edi, %%edx           \n\t"
3044             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3045             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3046          "avg_Alp:                       \n\t"
3047             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3048             "movq %%mm5, %%mm3           \n\t"
3049             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3050             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3051             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3052             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3053             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte where both
3054                                 // lsb's were == 1
3055             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3056             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
3057             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each byte
3058             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
3059             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
3060             "addl $8, %%ebx              \n\t"
3061             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each byte
3062             "cmpl _MMXLength, %%ebx      \n\t"
3063             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3064             "jb avg_Alp                  \n\t"
3066             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3068             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3070             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3071          );
3072 #endif /* 0 - NEVER REACHED */
3073       }
3074       break;
3076    } // end switch (bpp)
   // Post-switch clean-up:  decode the bytes from offset _MMXLength up to
   // _FullLength one at a time with integer code, then execute EMMS.
   // Registers:  ecx = bpp on entry (then reused as scratch), esi = prev_row,
   // edi = row (all preloaded through the asm constraints), ebx = byte
   // offset x, edx = row - bpp.
3078    __asm__ __volatile__ (
3079       // MMX acceleration complete; now do clean-up
3080       // check if any remaining bytes left to decode
3081 #ifdef __PIC__
3082       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3083 #endif
3084       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3085 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
3086       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3087       "jnb avg_end                 \n\t" // no tail bytes:  go straight to EMMS
3089       // do Avg decode for remaining bytes
3090 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3091       "movl %%edi, %%edx           \n\t"
3092 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3093       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3094       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3096    "avg_lp2:                       \n\t"
3097       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3098       "xorl %%eax, %%eax           \n\t"
3099       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3100       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3101       "addw %%cx, %%ax             \n\t" // 16-bit add keeps the 9-bit sum intact
3102       "incl %%ebx                  \n\t"
3103       "shrw %%ax                   \n\t" // divide by 2
3104       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3105       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3106       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3107       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3109    "avg_end:                       \n\t"
3110       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3111 #ifdef __PIC__
3112       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3113 #endif
3115       : "=c" (dummy_value_c),            // output regs (dummy)
3116         "=S" (dummy_value_S),
3117         "=D" (dummy_value_D)
3119       : "0" (bpp),       // ecx          // input regs
3120         "1" (prev_row),  // esi
3121         "2" (row)        // edi
3123       : "%eax", "%edx"                   // clobber list
3124 #ifndef __PIC__
3125       , "%ebx"
3126 #endif
3127    );
3129 } /* end png_read_filter_row_mmx_avg() */
3134 //===========================================================================//
3135 //                                                                           //
3136 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
3137 //                                                                           //
3138 //===========================================================================//
3140 // Optimized code for PNG Paeth filter decoder
3142 static void /* PRIVATE */
3143 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3144                               png_bytep prev_row)
3146    int bpp;
3147    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3148    int dummy_value_S;
3149    int dummy_value_D;
3151    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3152    _FullLength  = row_info->rowbytes; // # of bytes to filter
   // Scalar prologue of the Paeth defilter.  Three steps:
   //   1. paeth_rlp:  the first bpp bytes have no left neighbour, so
   //      Raw(x) = Paeth(x) + Prior(x).
   //   2. paeth_lp1:  full Paeth predictor, one byte at a time, from offset
   //      bpp up to _dif, where
   //         _dif = ((row + bpp + 15) & ~7) - row
   //   3. paeth_go:   _MMXLength = _FullLength - ((_FullLength - ebx) & 7)
   // Registers:  ecx = bpp on entry (then reused as scratch), esi = prev_row,
   // edi = row (all preloaded through the asm constraints), ebx = byte
   // offset x, edx = offset x - bpp.
   // Predictor, with a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp):
   //    pa = |b - c|, pb = |a - c|, pc = |a + b - 2c|
   //    pick a if pa <= pb and pa <= pc, else b if pb <= pc, else c
3154    __asm__ __volatile__ (
3155 #ifdef __PIC__
3156       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3157 #endif
3158       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
3159 //pre "movl row, %%edi             \n\t"
3160       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
3161 //pre "movl prev_row, %%esi        \n\t"
3162       "xorl %%eax, %%eax           \n\t"
3164       // Compute the Raw value for the first bpp bytes
3165       // Note: the formula works out to be always
3166       //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
3167    "paeth_rlp:                     \n\t"
3168       "movb (%%edi,%%ebx,), %%al   \n\t"
3169       "addb (%%esi,%%ebx,), %%al   \n\t"
3170       "incl %%ebx                  \n\t"
3171 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
3172       "cmpl %%ecx, %%ebx           \n\t"
3173       "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov leaves the cmpl flags intact
3174       "jb paeth_rlp                \n\t" // loop while ebx < bpp (unsigned)
3175       // get # of bytes to alignment
3176       "movl %%edi, _dif            \n\t" // take start of row
3177       "addl %%ebx, _dif            \n\t" // add bpp
3178       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl below
3179       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment boundary
3180       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
3181       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx at alignment
3182       "jz paeth_go                 \n\t" // tests the result of the subl above
3183       // fix alignment
      // NOTE(review):  this loop is bounded by _dif only, not _FullLength --
      // presumably callers only pass rows long enough; confirm.
3185    "paeth_lp1:                     \n\t"
3186       "xorl %%eax, %%eax           \n\t"
3187       // pav = p - a = (a + b - c) - a = b - c
3188       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3189       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3190       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3191       "movl %%eax, _patemp         \n\t" // Save pav for later use
3192       "xorl %%eax, %%eax           \n\t"
3193       // pbv = p - b = (a + b - c) - b = a - c
3194       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3195       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3196       "movl %%eax, %%ecx           \n\t" // ecx:  pbv
3197       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3198       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3199       // pc = abs(pcv)
3200       "testl $0x80000000, %%eax    \n\t" // sign bit set?
3201       "jz paeth_pca                \n\t"
3202       "negl %%eax                  \n\t" // reverse sign of neg values
3204    "paeth_pca:                     \n\t"
3205       "movl %%eax, _pctemp         \n\t" // save pc for later use
3206       // pb = abs(pbv)
3207       "testl $0x80000000, %%ecx    \n\t" // sign bit set?
3208       "jz paeth_pba                \n\t"
3209       "negl %%ecx                  \n\t" // reverse sign of neg values
3211    "paeth_pba:                     \n\t"
3212       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
3213       // pa = abs(pav)
3214       "movl _patemp, %%eax         \n\t"
3215       "testl $0x80000000, %%eax    \n\t" // sign bit set?
3216       "jz paeth_paa                \n\t"
3217       "negl %%eax                  \n\t" // reverse sign of neg values
3219    "paeth_paa:                     \n\t"
3220       "movl %%eax, _patemp         \n\t" // save pa for later use
3221       // test if pa <= pb
3222       "cmpl %%ecx, %%eax           \n\t"
3223       "jna paeth_abb               \n\t"
3224       // pa > pb; now test if pb <= pc
3225       "cmpl _pctemp, %%ecx         \n\t"
3226       "jna paeth_bbc               \n\t"
3227       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3229       "jmp paeth_paeth             \n\t"
3231    "paeth_bbc:                     \n\t"
3232       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3233       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
3234       "jmp paeth_paeth             \n\t"
3236    "paeth_abb:                     \n\t"
3237       // pa <= pb; now test if pa <= pc
3238       "cmpl _pctemp, %%eax         \n\t"
3239       "jna paeth_abc               \n\t"
3240       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3241       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3242       "jmp paeth_paeth             \n\t"
3244    "paeth_abc:                     \n\t"
3245       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3246       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
3248    "paeth_paeth:                   \n\t"
3249       "incl %%ebx                  \n\t"
3250       "incl %%edx                  \n\t"
3251       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3252       "addb %%cl, -1(%%edi,%%ebx,) \n\t" // -1 to offset the inc of ebx above
3253       "cmpl _dif, %%ebx            \n\t"
3254       "jb paeth_lp1                \n\t" // loop while ebx < _dif (unsigned)
3256    "paeth_go:                      \n\t"
3257       "movl _FullLength, %%ecx     \n\t"
3258       "movl %%ecx, %%eax           \n\t"
3259       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3260       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3261       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
3262       "movl %%ecx, _MMXLength      \n\t"
3263 #ifdef __PIC__
3264       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3265 #endif
3267       : "=c" (dummy_value_c),            // output regs (dummy)
3268         "=S" (dummy_value_S),
3269         "=D" (dummy_value_D)
3271       : "0" (bpp),       // ecx          // input regs
3272         "1" (prev_row),  // esi
3273         "2" (row)        // edi
3275       : "%eax", "%edx"                   // clobber list
3276 #ifndef __PIC__
3277       , "%ebx"
3278 #endif
3279    );
3281    // now do the math for the rest of the row
3282    switch (bpp)
3283    {
3284       case 3:
3285       {
3286          _ActiveMask.use = 0x0000000000ffffffLL;
3287          _ActiveMaskEnd.use = 0xffff000000000000LL;
3288          _ShiftBpp.use = 24;    // == bpp(3) * 8
3289          _ShiftRem.use = 40;    // == 64 - 24
3291          __asm__ __volatile__ (
3292             "movl _dif, %%ecx            \n\t"
3293 // preload  "movl row, %%edi             \n\t"
3294 // preload  "movl prev_row, %%esi        \n\t"
3295             "pxor %%mm0, %%mm0           \n\t"
3296             // prime the pump:  load the first Raw(x-bpp) data set
3297             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3298          "paeth_3lp:                     \n\t"
3299             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st 3 bytes
3300             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3301             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3302             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3303             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3304             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st 3 bytes
3305             // pav = p - a = (a + b - c) - a = b - c
3306             "movq %%mm2, %%mm4           \n\t"
3307             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3308             // pbv = p - b = (a + b - c) - b = a - c
3309             "movq %%mm1, %%mm5           \n\t"
3310             "psubw %%mm3, %%mm4          \n\t"
3311             "pxor %%mm7, %%mm7           \n\t"
3312             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3313             "movq %%mm4, %%mm6           \n\t"
3314             "psubw %%mm3, %%mm5          \n\t"
3316             // pa = abs(p-a) = abs(pav)
3317             // pb = abs(p-b) = abs(pbv)
3318             // pc = abs(p-c) = abs(pcv)
3319             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3320             "paddw %%mm5, %%mm6          \n\t"
3321             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3322             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3323             "psubw %%mm0, %%mm4          \n\t"
3324             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3325             "psubw %%mm0, %%mm4          \n\t"
3326             "psubw %%mm7, %%mm5          \n\t"
3327             "pxor %%mm0, %%mm0           \n\t"
3328             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3329             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3330             "psubw %%mm7, %%mm5          \n\t"
3331             "psubw %%mm0, %%mm6          \n\t"
3332             //  test pa <= pb
3333             "movq %%mm4, %%mm7           \n\t"
3334             "psubw %%mm0, %%mm6          \n\t"
3335             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3336             "movq %%mm7, %%mm0           \n\t"
3337             // use mm7 mask to merge pa & pb
3338             "pand %%mm7, %%mm5           \n\t"
3339             // use mm0 mask copy to merge a & b
3340             "pand %%mm0, %%mm2           \n\t"
3341             "pandn %%mm4, %%mm7          \n\t"
3342             "pandn %%mm1, %%mm0          \n\t"
3343             "paddw %%mm5, %%mm7          \n\t"
3344             "paddw %%mm2, %%mm0          \n\t"
3345             //  test  ((pa <= pb)? pa:pb) <= pc
3346             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3347             "pxor %%mm1, %%mm1           \n\t"
3348             "pand %%mm7, %%mm3           \n\t"
3349             "pandn %%mm0, %%mm7          \n\t"
3350             "paddw %%mm3, %%mm7          \n\t"
3351             "pxor %%mm0, %%mm0           \n\t"
3352             "packuswb %%mm1, %%mm7       \n\t"
3353             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3354             "pand _ActiveMask, %%mm7     \n\t"
3355             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3356             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3357             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3358             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3359             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
3360             // now do Paeth for 2nd set of bytes (3-5)
3361             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
3362             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3363             "pxor %%mm7, %%mm7           \n\t"
3364             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3365             // pbv = p - b = (a + b - c) - b = a - c
3366             "movq %%mm1, %%mm5           \n\t"
3367             // pav = p - a = (a + b - c) - a = b - c
3368             "movq %%mm2, %%mm4           \n\t"
3369             "psubw %%mm3, %%mm5          \n\t"
3370             "psubw %%mm3, %%mm4          \n\t"
3371             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3372             //       pav + pbv = pbv + pav
3373             "movq %%mm5, %%mm6           \n\t"
3374             "paddw %%mm4, %%mm6          \n\t"
3376             // pa = abs(p-a) = abs(pav)
3377             // pb = abs(p-b) = abs(pbv)
3378             // pc = abs(p-c) = abs(pcv)
3379             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
3380             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
3381             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
3382             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
3383             "psubw %%mm0, %%mm5          \n\t"
3384             "psubw %%mm7, %%mm4          \n\t"
3385             "psubw %%mm0, %%mm5          \n\t"
3386             "psubw %%mm7, %%mm4          \n\t"
3387             "pxor %%mm0, %%mm0           \n\t"
3388             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3389             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3390             "psubw %%mm0, %%mm6          \n\t"
3391             //  test pa <= pb
3392             "movq %%mm4, %%mm7           \n\t"
3393             "psubw %%mm0, %%mm6          \n\t"
3394             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3395             "movq %%mm7, %%mm0           \n\t"
3396             // use mm7 mask to merge pa & pb
3397             "pand %%mm7, %%mm5           \n\t"
3398             // use mm0 mask copy to merge a & b
3399             "pand %%mm0, %%mm2           \n\t"
3400             "pandn %%mm4, %%mm7          \n\t"
3401             "pandn %%mm1, %%mm0          \n\t"
3402             "paddw %%mm5, %%mm7          \n\t"
3403             "paddw %%mm2, %%mm0          \n\t"
3404             //  test  ((pa <= pb)? pa:pb) <= pc
3405             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3406             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3407             "pand %%mm7, %%mm3           \n\t"
3408             "pandn %%mm0, %%mm7          \n\t"
3409             "pxor %%mm1, %%mm1           \n\t"
3410             "paddw %%mm3, %%mm7          \n\t"
3411             "pxor %%mm0, %%mm0           \n\t"
3412             "packuswb %%mm1, %%mm7       \n\t"
3413             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
3414             "pand _ActiveMask, %%mm7     \n\t"
3415             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3416             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of 3 bytes
3417              // pav = p - a = (a + b - c) - a = b - c
3418             "movq %%mm2, %%mm4           \n\t"
3419             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3420             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
3421             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3422             "movq %%mm7, %%mm1           \n\t"
3423             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3424             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
3425                                     // now mm1 will be used as Raw(x-bpp)
3426             // now do Paeth for 3rd, and final, set of bytes (6-7)
3427             "pxor %%mm7, %%mm7           \n\t"
3428             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3429             "psubw %%mm3, %%mm4          \n\t"
3430             // pbv = p - b = (a + b - c) - b = a - c
3431             "movq %%mm1, %%mm5           \n\t"
3432             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3433             "movq %%mm4, %%mm6           \n\t"
3434             "psubw %%mm3, %%mm5          \n\t"
3435             "pxor %%mm0, %%mm0           \n\t"
3436             "paddw %%mm5, %%mm6          \n\t"
3438             // pa = abs(p-a) = abs(pav)
3439             // pb = abs(p-b) = abs(pbv)
3440             // pc = abs(p-c) = abs(pcv)
3441             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3442             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3443             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3444             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3445             "psubw %%mm0, %%mm4          \n\t"
3446             "psubw %%mm7, %%mm5          \n\t"
3447             "psubw %%mm0, %%mm4          \n\t"
3448             "psubw %%mm7, %%mm5          \n\t"
3449             "pxor %%mm0, %%mm0           \n\t"
3450             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3451             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3452             "psubw %%mm0, %%mm6          \n\t"
3453             //  test pa <= pb
3454             "movq %%mm4, %%mm7           \n\t"
3455             "psubw %%mm0, %%mm6          \n\t"
3456             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3457             "movq %%mm7, %%mm0           \n\t"
3458             // use mm0 mask copy to merge a & b
3459             "pand %%mm0, %%mm2           \n\t"
3460             // use mm7 mask to merge pa & pb
3461             "pand %%mm7, %%mm5           \n\t"
3462             "pandn %%mm1, %%mm0          \n\t"
3463             "pandn %%mm4, %%mm7          \n\t"
3464             "paddw %%mm2, %%mm0          \n\t"
3465             "paddw %%mm5, %%mm7          \n\t"
3466             //  test  ((pa <= pb)? pa:pb) <= pc
3467             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3468             "pand %%mm7, %%mm3           \n\t"
3469             "pandn %%mm0, %%mm7          \n\t"
3470             "paddw %%mm3, %%mm7          \n\t"
3471             "pxor %%mm1, %%mm1           \n\t"
3472             "packuswb %%mm7, %%mm1       \n\t"
3473             // step ecx to next set of 8 bytes and repeat loop til done
3474             "addl $8, %%ecx              \n\t"
3475             "pand _ActiveMaskEnd, %%mm1  \n\t"
3476             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3478             "cmpl _MMXLength, %%ecx      \n\t"
3479             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
3480             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3481                                  // mm1 will be used as Raw(x-bpp) next loop
3482                            // mm3 ready to be used as Prior(x-bpp) next loop
3483             "jb paeth_3lp                \n\t"
3485             : "=S" (dummy_value_S),             // output regs (dummy)
3486               "=D" (dummy_value_D)
3488             : "0" (prev_row),  // esi           // input regs
3489               "1" (row)        // edi
3491             : "%ecx"                            // clobber list
3492 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3493             , "%mm0", "%mm1", "%mm2", "%mm3"
3494             , "%mm4", "%mm5", "%mm6", "%mm7"
3495 #endif
3496          );
3497       }
3498       break;  // end 3 bpp
3500       case 6:
3501       //case 7:   // GRR BOGUS
3502       //case 5:   // GRR BOGUS
3503       {
3504          _ActiveMask.use  = 0x00000000ffffffffLL;
3505          _ActiveMask2.use = 0xffffffff00000000LL;
3506          _ShiftBpp.use = bpp << 3;    // == bpp * 8
3507          _ShiftRem.use = 64 - _ShiftBpp.use;
3509          __asm__ __volatile__ (
3510             "movl _dif, %%ecx            \n\t"
3511 // preload  "movl row, %%edi             \n\t"
3512 // preload  "movl prev_row, %%esi        \n\t"
3513             // prime the pump:  load the first Raw(x-bpp) data set
3514             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3515             "pxor %%mm0, %%mm0           \n\t"
3517          "paeth_6lp:                     \n\t"
3518             // must shift to position Raw(x-bpp) data
3519             "psrlq _ShiftRem, %%mm1      \n\t"
3520             // do first set of 4 bytes
3521             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3522             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3523             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3524             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3525             // must shift to position Prior(x-bpp) data
3526             "psrlq _ShiftRem, %%mm3      \n\t"
3527             // pav = p - a = (a + b - c) - a = b - c
3528             "movq %%mm2, %%mm4           \n\t"
3529             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3530             // pbv = p - b = (a + b - c) - b = a - c
3531             "movq %%mm1, %%mm5           \n\t"
3532             "psubw %%mm3, %%mm4          \n\t"
3533             "pxor %%mm7, %%mm7           \n\t"
3534             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3535             "movq %%mm4, %%mm6           \n\t"
3536             "psubw %%mm3, %%mm5          \n\t"
3537             // pa = abs(p-a) = abs(pav)
3538             // pb = abs(p-b) = abs(pbv)
3539             // pc = abs(p-c) = abs(pcv)
3540             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3541             "paddw %%mm5, %%mm6          \n\t"
3542             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3543             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3544             "psubw %%mm0, %%mm4          \n\t"
3545             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3546             "psubw %%mm0, %%mm4          \n\t"
3547             "psubw %%mm7, %%mm5          \n\t"
3548             "pxor %%mm0, %%mm0           \n\t"
3549             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3550             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3551             "psubw %%mm7, %%mm5          \n\t"
3552             "psubw %%mm0, %%mm6          \n\t"
3553             //  test pa <= pb
3554             "movq %%mm4, %%mm7           \n\t"
3555             "psubw %%mm0, %%mm6          \n\t"
3556             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3557             "movq %%mm7, %%mm0           \n\t"
3558             // use mm7 mask to merge pa & pb
3559             "pand %%mm7, %%mm5           \n\t"
3560             // use mm0 mask copy to merge a & b
3561             "pand %%mm0, %%mm2           \n\t"
3562             "pandn %%mm4, %%mm7          \n\t"
3563             "pandn %%mm1, %%mm0          \n\t"
3564             "paddw %%mm5, %%mm7          \n\t"
3565             "paddw %%mm2, %%mm0          \n\t"
3566             //  test  ((pa <= pb)? pa:pb) <= pc
3567             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3568             "pxor %%mm1, %%mm1           \n\t"
3569             "pand %%mm7, %%mm3           \n\t"
3570             "pandn %%mm0, %%mm7          \n\t"
3571             "paddw %%mm3, %%mm7          \n\t"
3572             "pxor %%mm0, %%mm0           \n\t"
3573             "packuswb %%mm1, %%mm7       \n\t"
3574             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3575             "pand _ActiveMask, %%mm7     \n\t"
3576             "psrlq _ShiftRem, %%mm3      \n\t"
3577             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
3578             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3579             "movq %%mm2, %%mm6           \n\t"
3580             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3581             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3582             "psllq _ShiftBpp, %%mm6      \n\t"
3583             "movq %%mm7, %%mm5           \n\t"
3584             "psrlq _ShiftRem, %%mm1      \n\t"
3585             "por %%mm6, %%mm3            \n\t"
3586             "psllq _ShiftBpp, %%mm5      \n\t"
3587             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3588             "por %%mm5, %%mm1            \n\t"
3589             // do second set of 4 bytes
3590             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3591             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3592             // pav = p - a = (a + b - c) - a = b - c
3593             "movq %%mm2, %%mm4           \n\t"
3594             // pbv = p - b = (a + b - c) - b = a - c
3595             "movq %%mm1, %%mm5           \n\t"
3596             "psubw %%mm3, %%mm4          \n\t"
3597             "pxor %%mm7, %%mm7           \n\t"
3598             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3599             "movq %%mm4, %%mm6           \n\t"
3600             "psubw %%mm3, %%mm5          \n\t"
3601             // pa = abs(p-a) = abs(pav)
3602             // pb = abs(p-b) = abs(pbv)
3603             // pc = abs(p-c) = abs(pcv)
3604             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3605             "paddw %%mm5, %%mm6          \n\t"
3606             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3607             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3608             "psubw %%mm0, %%mm4          \n\t"
3609             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3610             "psubw %%mm0, %%mm4          \n\t"
3611             "psubw %%mm7, %%mm5          \n\t"
3612             "pxor %%mm0, %%mm0           \n\t"
3613             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3614             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3615             "psubw %%mm7, %%mm5          \n\t"
3616             "psubw %%mm0, %%mm6          \n\t"
3617             //  test pa <= pb
3618             "movq %%mm4, %%mm7           \n\t"
3619             "psubw %%mm0, %%mm6          \n\t"
3620             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3621             "movq %%mm7, %%mm0           \n\t"
3622             // use mm7 mask to merge pa & pb
3623             "pand %%mm7, %%mm5           \n\t"
3624             // use mm0 mask copy to merge a & b
3625             "pand %%mm0, %%mm2           \n\t"
3626             "pandn %%mm4, %%mm7          \n\t"
3627             "pandn %%mm1, %%mm0          \n\t"
3628             "paddw %%mm5, %%mm7          \n\t"
3629             "paddw %%mm2, %%mm0          \n\t"
3630             //  test  ((pa <= pb)? pa:pb) <= pc
3631             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3632             "pxor %%mm1, %%mm1           \n\t"
3633             "pand %%mm7, %%mm3           \n\t"
3634             "pandn %%mm0, %%mm7          \n\t"
3635             "pxor %%mm1, %%mm1           \n\t"
3636             "paddw %%mm3, %%mm7          \n\t"
3637             "pxor %%mm0, %%mm0           \n\t"
3638             // step ecx to next set of 8 bytes and repeat loop til done
3639             "addl $8, %%ecx              \n\t"
3640             "packuswb %%mm7, %%mm1       \n\t"
3641             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3642             "cmpl _MMXLength, %%ecx      \n\t"
3643             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3644                                 // mm1 will be used as Raw(x-bpp) next loop
3645             "jb paeth_6lp                \n\t"
3647             : "=S" (dummy_value_S),             // output regs (dummy)
3648               "=D" (dummy_value_D)
3650             : "0" (prev_row),  // esi           // input regs
3651               "1" (row)        // edi
3653             : "%ecx"                            // clobber list
3654 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3655             , "%mm0", "%mm1", "%mm2", "%mm3"
3656             , "%mm4", "%mm5", "%mm6", "%mm7"
3657 #endif
3658          );
3659       }
3660       break;  // end 6 bpp
3662       case 4:
3663       {
3664          _ActiveMask.use  = 0x00000000ffffffffLL;
3666          __asm__ __volatile__ (
3667             "movl _dif, %%ecx            \n\t"
3668 // preload  "movl row, %%edi             \n\t"
3669 // preload  "movl prev_row, %%esi        \n\t"
3670             "pxor %%mm0, %%mm0           \n\t"
3671             // prime the pump:  load the first Raw(x-bpp) data set
3672             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3673                                      //  a=Raw(x-bpp) bytes
3674          "paeth_4lp:                     \n\t"
3675             // do first set of 4 bytes
3676             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3677             "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3678             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3679             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3680             // pav = p - a = (a + b - c) - a = b - c
3681             "movq %%mm2, %%mm4           \n\t"
3682             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3683             // pbv = p - b = (a + b - c) - b = a - c
3684             "movq %%mm1, %%mm5           \n\t"
3685             "psubw %%mm3, %%mm4          \n\t"
3686             "pxor %%mm7, %%mm7           \n\t"
3687             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3688             "movq %%mm4, %%mm6           \n\t"
3689             "psubw %%mm3, %%mm5          \n\t"
3690             // pa = abs(p-a) = abs(pav)
3691             // pb = abs(p-b) = abs(pbv)
3692             // pc = abs(p-c) = abs(pcv)
3693             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3694             "paddw %%mm5, %%mm6          \n\t"
3695             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3696             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3697             "psubw %%mm0, %%mm4          \n\t"
3698             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3699             "psubw %%mm0, %%mm4          \n\t"
3700             "psubw %%mm7, %%mm5          \n\t"
3701             "pxor %%mm0, %%mm0           \n\t"
3702             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3703             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3704             "psubw %%mm7, %%mm5          \n\t"
3705             "psubw %%mm0, %%mm6          \n\t"
3706             //  test pa <= pb
3707             "movq %%mm4, %%mm7           \n\t"
3708             "psubw %%mm0, %%mm6          \n\t"
3709             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3710             "movq %%mm7, %%mm0           \n\t"
3711             // use mm7 mask to merge pa & pb
3712             "pand %%mm7, %%mm5           \n\t"
3713             // use mm0 mask copy to merge a & b
3714             "pand %%mm0, %%mm2           \n\t"
3715             "pandn %%mm4, %%mm7          \n\t"
3716             "pandn %%mm1, %%mm0          \n\t"
3717             "paddw %%mm5, %%mm7          \n\t"
3718             "paddw %%mm2, %%mm0          \n\t"
3719             //  test  ((pa <= pb)? pa:pb) <= pc
3720             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3721             "pxor %%mm1, %%mm1           \n\t"
3722             "pand %%mm7, %%mm3           \n\t"
3723             "pandn %%mm0, %%mm7          \n\t"
3724             "paddw %%mm3, %%mm7          \n\t"
3725             "pxor %%mm0, %%mm0           \n\t"
3726             "packuswb %%mm1, %%mm7       \n\t"
3727             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3728             "pand _ActiveMask, %%mm7     \n\t"
3729             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3730             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3731             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3732             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3733             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
3734             // do second set of 4 bytes
3735             "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3736             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3737             // pav = p - a = (a + b - c) - a = b - c
3738             "movq %%mm2, %%mm4           \n\t"
3739             // pbv = p - b = (a + b - c) - b = a - c
3740             "movq %%mm1, %%mm5           \n\t"
3741             "psubw %%mm3, %%mm4          \n\t"
3742             "pxor %%mm7, %%mm7           \n\t"
3743             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3744             "movq %%mm4, %%mm6           \n\t"
3745             "psubw %%mm3, %%mm5          \n\t"
3746             // pa = abs(p-a) = abs(pav)
3747             // pb = abs(p-b) = abs(pbv)
3748             // pc = abs(p-c) = abs(pcv)
3749             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3750             "paddw %%mm5, %%mm6          \n\t"
3751             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3752             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3753             "psubw %%mm0, %%mm4          \n\t"
3754             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3755             "psubw %%mm0, %%mm4          \n\t"
3756             "psubw %%mm7, %%mm5          \n\t"
3757             "pxor %%mm0, %%mm0           \n\t"
3758             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3759             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3760             "psubw %%mm7, %%mm5          \n\t"
3761             "psubw %%mm0, %%mm6          \n\t"
3762             //  test pa <= pb
3763             "movq %%mm4, %%mm7           \n\t"
3764             "psubw %%mm0, %%mm6          \n\t"
3765             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3766             "movq %%mm7, %%mm0           \n\t"
3767             // use mm7 mask to merge pa & pb
3768             "pand %%mm7, %%mm5           \n\t"
3769             // use mm0 mask copy to merge a & b
3770             "pand %%mm0, %%mm2           \n\t"
3771             "pandn %%mm4, %%mm7          \n\t"
3772             "pandn %%mm1, %%mm0          \n\t"
3773             "paddw %%mm5, %%mm7          \n\t"
3774             "paddw %%mm2, %%mm0          \n\t"
3775             //  test  ((pa <= pb)? pa:pb) <= pc
3776             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3777             "pxor %%mm1, %%mm1           \n\t"
3778             "pand %%mm7, %%mm3           \n\t"
3779             "pandn %%mm0, %%mm7          \n\t"
3780             "pxor %%mm1, %%mm1           \n\t"
3781             "paddw %%mm3, %%mm7          \n\t"
3782             "pxor %%mm0, %%mm0           \n\t"
3783             // step ecx to next set of 8 bytes and repeat loop til done
3784             "addl $8, %%ecx              \n\t"
3785             "packuswb %%mm7, %%mm1       \n\t"
3786             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
3787             "cmpl _MMXLength, %%ecx      \n\t"
3788             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3789                                 // mm1 will be used as Raw(x-bpp) next loop
3790             "jb paeth_4lp                \n\t"
3792             : "=S" (dummy_value_S),             // output regs (dummy)
3793               "=D" (dummy_value_D)
3795             : "0" (prev_row),  // esi           // input regs
3796               "1" (row)        // edi
3798             : "%ecx"                            // clobber list
3799 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3800             , "%mm0", "%mm1", "%mm2", "%mm3"
3801             , "%mm4", "%mm5", "%mm6", "%mm7"
3802 #endif
3803          );
3804       }
3805       break;  // end 4 bpp
3807       case 8:                          // bpp == 8
3808       {
3809          _ActiveMask.use  = 0x00000000ffffffffLL;
3811          __asm__ __volatile__ (
3812             "movl _dif, %%ecx            \n\t"
3813 // preload  "movl row, %%edi             \n\t"
3814 // preload  "movl prev_row, %%esi        \n\t"
3815             "pxor %%mm0, %%mm0           \n\t"
3816             // prime the pump:  load the first Raw(x-bpp) data set
3817             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3818                                        //  a=Raw(x-bpp) bytes
3819          "paeth_8lp:                     \n\t"
3820             // do first set of 4 bytes
3821             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3822             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3823             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3824             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3825             // pav = p - a = (a + b - c) - a = b - c
3826             "movq %%mm2, %%mm4           \n\t"
3827             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3828             // pbv = p - b = (a + b - c) - b = a - c
3829             "movq %%mm1, %%mm5           \n\t"
3830             "psubw %%mm3, %%mm4          \n\t"
3831             "pxor %%mm7, %%mm7           \n\t"
3832             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3833             "movq %%mm4, %%mm6           \n\t"
3834             "psubw %%mm3, %%mm5          \n\t"
3835             // pa = abs(p-a) = abs(pav)
3836             // pb = abs(p-b) = abs(pbv)
3837             // pc = abs(p-c) = abs(pcv)
3838             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3839             "paddw %%mm5, %%mm6          \n\t"
3840             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3841             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3842             "psubw %%mm0, %%mm4          \n\t"
3843             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3844             "psubw %%mm0, %%mm4          \n\t"
3845             "psubw %%mm7, %%mm5          \n\t"
3846             "pxor %%mm0, %%mm0           \n\t"
3847             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3848             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3849             "psubw %%mm7, %%mm5          \n\t"
3850             "psubw %%mm0, %%mm6          \n\t"
3851             //  test pa <= pb
3852             "movq %%mm4, %%mm7           \n\t"
3853             "psubw %%mm0, %%mm6          \n\t"
3854             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3855             "movq %%mm7, %%mm0           \n\t"
3856             // use mm7 mask to merge pa & pb
3857             "pand %%mm7, %%mm5           \n\t"
3858             // use mm0 mask copy to merge a & b
3859             "pand %%mm0, %%mm2           \n\t"
3860             "pandn %%mm4, %%mm7          \n\t"
3861             "pandn %%mm1, %%mm0          \n\t"
3862             "paddw %%mm5, %%mm7          \n\t"
3863             "paddw %%mm2, %%mm0          \n\t"
3864             //  test  ((pa <= pb)? pa:pb) <= pc
3865             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3866             "pxor %%mm1, %%mm1           \n\t"
3867             "pand %%mm7, %%mm3           \n\t"
3868             "pandn %%mm0, %%mm7          \n\t"
3869             "paddw %%mm3, %%mm7          \n\t"
3870             "pxor %%mm0, %%mm0           \n\t"
3871             "packuswb %%mm1, %%mm7       \n\t"
3872             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3873             "pand _ActiveMask, %%mm7     \n\t"
3874             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3875             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3876             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3877             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3878             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
3880             // do second set of 4 bytes
3881             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3882             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3883             // pav = p - a = (a + b - c) - a = b - c
3884             "movq %%mm2, %%mm4           \n\t"
3885             // pbv = p - b = (a + b - c) - b = a - c
3886             "movq %%mm1, %%mm5           \n\t"
3887             "psubw %%mm3, %%mm4          \n\t"
3888             "pxor %%mm7, %%mm7           \n\t"
3889             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3890             "movq %%mm4, %%mm6           \n\t"
3891             "psubw %%mm3, %%mm5          \n\t"
3892             // pa = abs(p-a) = abs(pav)
3893             // pb = abs(p-b) = abs(pbv)
3894             // pc = abs(p-c) = abs(pcv)
3895             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3896             "paddw %%mm5, %%mm6          \n\t"
3897             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3898             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3899             "psubw %%mm0, %%mm4          \n\t"
3900             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3901             "psubw %%mm0, %%mm4          \n\t"
3902             "psubw %%mm7, %%mm5          \n\t"
3903             "pxor %%mm0, %%mm0           \n\t"
3904             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3905             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3906             "psubw %%mm7, %%mm5          \n\t"
3907             "psubw %%mm0, %%mm6          \n\t"
3908             //  test pa <= pb
3909             "movq %%mm4, %%mm7           \n\t"
3910             "psubw %%mm0, %%mm6          \n\t"
3911             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3912             "movq %%mm7, %%mm0           \n\t"
3913             // use mm7 mask to merge pa & pb
3914             "pand %%mm7, %%mm5           \n\t"
3915             // use mm0 mask copy to merge a & b
3916             "pand %%mm0, %%mm2           \n\t"
3917             "pandn %%mm4, %%mm7          \n\t"
3918             "pandn %%mm1, %%mm0          \n\t"
3919             "paddw %%mm5, %%mm7          \n\t"
3920             "paddw %%mm2, %%mm0          \n\t"
3921             //  test  ((pa <= pb)? pa:pb) <= pc
3922             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3923             "pxor %%mm1, %%mm1           \n\t"
3924             "pand %%mm7, %%mm3           \n\t"
3925             "pandn %%mm0, %%mm7          \n\t"
3926             "pxor %%mm1, %%mm1           \n\t"
3927             "paddw %%mm3, %%mm7          \n\t"
3928             "pxor %%mm0, %%mm0           \n\t"
3929             // step ecx to next set of 8 bytes and repeat loop til done
3930             "addl $8, %%ecx              \n\t"
3931             "packuswb %%mm7, %%mm1       \n\t"
3932             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3933             "cmpl _MMXLength, %%ecx      \n\t"
3934             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3935                             // mm1 will be used as Raw(x-bpp) next loop
3936             "jb paeth_8lp                \n\t"
3938             : "=S" (dummy_value_S),             // output regs (dummy)
3939               "=D" (dummy_value_D)
3941             : "0" (prev_row),  // esi           // input regs
3942               "1" (row)        // edi
3944             : "%ecx"                            // clobber list
3945 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3946             , "%mm0", "%mm1", "%mm2", "%mm3"
3947             , "%mm4", "%mm5", "%mm6", "%mm7"
3948 #endif
3949          );
3950       }
3951       break;  // end 8 bpp
3953       case 1:                // bpp = 1
3954       case 2:                // bpp = 2
3955       default:               // bpp > 8
3956       {
3957          __asm__ __volatile__ (
3958 #ifdef __PIC__
3959             "pushl %%ebx                 \n\t" // save Global Offset Table index
3960 #endif
3961             "movl _dif, %%ebx            \n\t"
3962             "cmpl _FullLength, %%ebx     \n\t"
3963             "jnb paeth_dend              \n\t"
3965 // preload  "movl row, %%edi             \n\t"
3966 // preload  "movl prev_row, %%esi        \n\t"
3967             // do Paeth decode for remaining bytes
3968             "movl %%ebx, %%edx           \n\t"
3969 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3970             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
3971             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3973          "paeth_dlp:                     \n\t"
3974             "xorl %%eax, %%eax           \n\t"
3975             // pav = p - a = (a + b - c) - a = b - c
3976             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3977             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3978             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3979             "movl %%eax, _patemp         \n\t" // Save pav for later use
3980             "xorl %%eax, %%eax           \n\t"
3981             // pbv = p - b = (a + b - c) - b = a - c
3982             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3983             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3984             "movl %%eax, %%ecx           \n\t"
3985             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3986             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3987             // pc = abs(pcv)
3988             "testl $0x80000000, %%eax    \n\t"
3989             "jz paeth_dpca               \n\t"
3990             "negl %%eax                  \n\t" // reverse sign of neg values
3992          "paeth_dpca:                    \n\t"
3993             "movl %%eax, _pctemp         \n\t" // save pc for later use
3994             // pb = abs(pbv)
3995             "testl $0x80000000, %%ecx    \n\t"
3996             "jz paeth_dpba               \n\t"
3997             "negl %%ecx                  \n\t" // reverse sign of neg values
3999          "paeth_dpba:                    \n\t"
4000             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4001             // pa = abs(pav)
4002             "movl _patemp, %%eax         \n\t"
4003             "testl $0x80000000, %%eax    \n\t"
4004             "jz paeth_dpaa               \n\t"
4005             "negl %%eax                  \n\t" // reverse sign of neg values
4007          "paeth_dpaa:                    \n\t"
4008             "movl %%eax, _patemp         \n\t" // save pa for later use
4009             // test if pa <= pb
4010             "cmpl %%ecx, %%eax           \n\t"
4011             "jna paeth_dabb              \n\t"
4012             // pa > pb; now test if pb <= pc
4013             "cmpl _pctemp, %%ecx         \n\t"
4014             "jna paeth_dbbc              \n\t"
4015             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4016             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4017             "jmp paeth_dpaeth            \n\t"
4019          "paeth_dbbc:                    \n\t"
4020             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4021             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4022             "jmp paeth_dpaeth            \n\t"
4024          "paeth_dabb:                    \n\t"
4025             // pa <= pb; now test if pa <= pc
4026             "cmpl _pctemp, %%eax         \n\t"
4027             "jna paeth_dabc              \n\t"
4028             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4029             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4030             "jmp paeth_dpaeth            \n\t"
4032          "paeth_dabc:                    \n\t"
4033             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4034             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4036          "paeth_dpaeth:                  \n\t"
4037             "incl %%ebx                  \n\t"
4038             "incl %%edx                  \n\t"
4039             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4040             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4041             "cmpl _FullLength, %%ebx     \n\t"
4042             "jb paeth_dlp                \n\t"
4044          "paeth_dend:                    \n\t"
4045 #ifdef __PIC__
4046             "popl %%ebx                  \n\t" // index to Global Offset Table
4047 #endif
4049             : "=c" (dummy_value_c),            // output regs (dummy)
4050               "=S" (dummy_value_S),
4051               "=D" (dummy_value_D)
4053             : "0" (bpp),       // ecx          // input regs
4054               "1" (prev_row),  // esi
4055               "2" (row)        // edi
4057             : "%eax", "%edx"                   // clobber list
4058 #ifndef __PIC__
4059             , "%ebx"
4060 #endif
4061          );
4062       }
4063       return;                   // No need to go further with this one
4065    } // end switch (bpp)
4067    __asm__ __volatile__ (
4068       // MMX acceleration complete; now do clean-up
4069       // check if any remaining bytes left to decode
4070 #ifdef __PIC__
4071       "pushl %%ebx                 \n\t" // save index to Global Offset Table
4072 #endif
4073       "movl _MMXLength, %%ebx      \n\t"
4074       "cmpl _FullLength, %%ebx     \n\t"
4075       "jnb paeth_end               \n\t"
4076 //pre "movl row, %%edi             \n\t"
4077 //pre "movl prev_row, %%esi        \n\t"
4078       // do Paeth decode for remaining bytes
4079       "movl %%ebx, %%edx           \n\t"
4080 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4081       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4082       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
4084    "paeth_lp2:                     \n\t"
4085       "xorl %%eax, %%eax           \n\t"
4086       // pav = p - a = (a + b - c) - a = b - c
4087       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4088       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4089       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4090       "movl %%eax, _patemp         \n\t" // Save pav for later use
4091       "xorl %%eax, %%eax           \n\t"
4092       // pbv = p - b = (a + b - c) - b = a - c
4093       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4094       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4095       "movl %%eax, %%ecx           \n\t"
4096       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4097       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4098       // pc = abs(pcv)
4099       "testl $0x80000000, %%eax    \n\t"
4100       "jz paeth_pca2               \n\t"
4101       "negl %%eax                  \n\t" // reverse sign of neg values
4103    "paeth_pca2:                    \n\t"
4104       "movl %%eax, _pctemp         \n\t" // save pc for later use
4105       // pb = abs(pbv)
4106       "testl $0x80000000, %%ecx    \n\t"
4107       "jz paeth_pba2               \n\t"
4108       "negl %%ecx                  \n\t" // reverse sign of neg values
4110    "paeth_pba2:                    \n\t"
4111       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4112       // pa = abs(pav)
4113       "movl _patemp, %%eax         \n\t"
4114       "testl $0x80000000, %%eax    \n\t"
4115       "jz paeth_paa2               \n\t"
4116       "negl %%eax                  \n\t" // reverse sign of neg values
4118    "paeth_paa2:                    \n\t"
4119       "movl %%eax, _patemp         \n\t" // save pa for later use
4120       // test if pa <= pb
4121       "cmpl %%ecx, %%eax           \n\t"
4122       "jna paeth_abb2              \n\t"
4123       // pa > pb; now test if pb <= pc
4124       "cmpl _pctemp, %%ecx         \n\t"
4125       "jna paeth_bbc2              \n\t"
4126       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4127       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4128       "jmp paeth_paeth2            \n\t"
4130    "paeth_bbc2:                    \n\t"
4131       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4132       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4133       "jmp paeth_paeth2            \n\t"
4135    "paeth_abb2:                    \n\t"
4136       // pa <= pb; now test if pa <= pc
4137       "cmpl _pctemp, %%eax         \n\t"
4138       "jna paeth_abc2              \n\t"
4139       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4140       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4141       "jmp paeth_paeth2            \n\t"
4143    "paeth_abc2:                    \n\t"
4144       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4145       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4147    "paeth_paeth2:                  \n\t"
4148       "incl %%ebx                  \n\t"
4149       "incl %%edx                  \n\t"
4150       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4151       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4152       "cmpl _FullLength, %%ebx     \n\t"
4153       "jb paeth_lp2                \n\t"
4155    "paeth_end:                     \n\t"
4156       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
4157 #ifdef __PIC__
4158       "popl %%ebx                  \n\t" // restore index to Global Offset Table
4159 #endif
4161       : "=c" (dummy_value_c),            // output regs (dummy)
4162         "=S" (dummy_value_S),
4163         "=D" (dummy_value_D)
4165       : "0" (bpp),       // ecx          // input regs
4166         "1" (prev_row),  // esi
4167         "2" (row)        // edi
4169       : "%eax", "%edx"                   // clobber list (no input regs!)
4170 #ifndef __PIC__
4171       , "%ebx"
4172 #endif
4173    );
4175 } /* end png_read_filter_row_mmx_paeth() */
4180 //===========================================================================//
4181 //                                                                           //
4182 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
4183 //                                                                           //
4184 //===========================================================================//
4186 // Optimized code for PNG Sub filter decoder
4188 static void /* PRIVATE */
4189 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4191    int bpp;
4192    int dummy_value_a;
4193    int dummy_value_D;
4195    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
4196    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
4198    __asm__ __volatile__ (
4199 //pre "movl row, %%edi             \n\t"
4200       "movl %%edi, %%esi           \n\t" // lp = row
4201 //pre "movl bpp, %%eax             \n\t"
4202       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4203 //irr "xorl %%eax, %%eax           \n\t"
4204       // get # of bytes to alignment
4205       "movl %%edi, _dif            \n\t" // take start of row
4206       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4207                                          //  alignment boundary
4208       "xorl %%ecx, %%ecx           \n\t"
4209       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4210       "subl %%edi, _dif            \n\t" // subtract from start ==> value
4211       "jz sub_go                   \n\t" //  ecx at alignment
4213    "sub_lp1:                       \n\t" // fix alignment
4214       "movb (%%esi,%%ecx,), %%al   \n\t"
4215       "addb %%al, (%%edi,%%ecx,)   \n\t"
4216       "incl %%ecx                  \n\t"
4217       "cmpl _dif, %%ecx            \n\t"
4218       "jb sub_lp1                  \n\t"
4220    "sub_go:                        \n\t"
4221       "movl _FullLength, %%eax     \n\t"
4222       "movl %%eax, %%edx           \n\t"
4223       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4224       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4225       "subl %%edx, %%eax           \n\t" // drop over bytes from length
4226       "movl %%eax, _MMXLength      \n\t"
4228       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4229         "=D" (dummy_value_D)    // 1
4231       : "0" (bpp),              // eax    // input regs
4232         "1" (row)               // edi
4234       : "%ebx", "%ecx", "%edx"            // clobber list
4235       , "%esi"
4237 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4238       , "%mm0", "%mm1", "%mm2", "%mm3"
4239       , "%mm4", "%mm5", "%mm6", "%mm7"
4240 #endif
4241    );
4243    // now do the math for the rest of the row
4244    switch (bpp)
4245    {
4246       case 3:
4247       {
4248          _ActiveMask.use  = 0x0000ffffff000000LL;
4249          _ShiftBpp.use = 24;       // == 3 * 8
4250          _ShiftRem.use  = 40;      // == 64 - 24
4252          __asm__ __volatile__ (
4253 // preload  "movl row, %%edi              \n\t"
4254             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
4255                                                 //  active byte group
4256             "movl %%edi, %%esi            \n\t" // lp = row
4257 // preload  "movl bpp, %%eax              \n\t"
4258             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4259             "movq %%mm7, %%mm6            \n\t"
4260             "movl _dif, %%edx             \n\t"
4261             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4262                                                 //  3rd active byte group
4263             // prime the pump:  load the first Raw(x-bpp) data set
4264             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4266          "sub_3lp:                        \n\t" // shift data for adding first
4267             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4268                                                 //  shift clears inactive bytes)
4269             // add 1st active group
4270             "movq (%%edi,%%edx,), %%mm0   \n\t"
4271             "paddb %%mm1, %%mm0           \n\t"
4273             // add 2nd active group
4274             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4275             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4276             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4277             "paddb %%mm1, %%mm0           \n\t"
4279             // add 3rd active group
4280             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4281             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4282             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4283             "addl $8, %%edx               \n\t"
4284             "paddb %%mm1, %%mm0           \n\t"
4286             "cmpl _MMXLength, %%edx       \n\t"
4287             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4288             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4289             "jb sub_3lp                   \n\t"
4291             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4292               "=D" (dummy_value_D)    // 1
4294             : "0" (bpp),              // eax    // input regs
4295               "1" (row)               // edi
4297             : "%edx", "%esi"                    // clobber list
4298 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4299             , "%mm0", "%mm1", "%mm6", "%mm7"
4300 #endif
4301          );
4302       }
4303       break;
4305       case 1:
4306       {
4307          __asm__ __volatile__ (
4308             "movl _dif, %%edx            \n\t"
4309 // preload  "movl row, %%edi             \n\t"
4310             "cmpl _FullLength, %%edx     \n\t"
4311             "jnb sub_1end                \n\t"
4312             "movl %%edi, %%esi           \n\t" // lp = row
4313             "xorl %%eax, %%eax           \n\t"
4314 // preload  "movl bpp, %%eax             \n\t"
4315             "addl %%eax, %%edi           \n\t" // rp = row + bpp
4317          "sub_1lp:                       \n\t"
4318             "movb (%%esi,%%edx,), %%al   \n\t"
4319             "addb %%al, (%%edi,%%edx,)   \n\t"
4320             "incl %%edx                  \n\t"
4321             "cmpl _FullLength, %%edx     \n\t"
4322             "jb sub_1lp                  \n\t"
4324          "sub_1end:                      \n\t"
4326             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4327               "=D" (dummy_value_D)    // 1
4329             : "0" (bpp),              // eax    // input regs
4330               "1" (row)               // edi
4332             : "%edx", "%esi"                    // clobber list
4333          );
4334       }
4335       return;
4337       case 6:
4338       case 4:
4339       //case 7:   // GRR BOGUS
4340       //case 5:   // GRR BOGUS
4341       {
4342          _ShiftBpp.use = bpp << 3;
4343          _ShiftRem.use = 64 - _ShiftBpp.use;
4345          __asm__ __volatile__ (
4346 // preload  "movl row, %%edi              \n\t"
4347             "movl _dif, %%edx             \n\t"
4348             "movl %%edi, %%esi            \n\t" // lp = row
4349 // preload  "movl bpp, %%eax              \n\t"
4350             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4352             // prime the pump:  load the first Raw(x-bpp) data set
4353             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4355          "sub_4lp:                        \n\t" // shift data for adding first
4356             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4357                                                 //  shift clears inactive bytes)
4358             "movq (%%edi,%%edx,), %%mm0   \n\t"
4359             "paddb %%mm1, %%mm0           \n\t"
4361             // add 2nd active group
4362             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4363             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4364             "addl $8, %%edx               \n\t"
4365             "paddb %%mm1, %%mm0           \n\t"
4367             "cmpl _MMXLength, %%edx       \n\t"
4368             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4369             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4370             "jb sub_4lp                   \n\t"
4372             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4373               "=D" (dummy_value_D)    // 1
4375             : "0" (bpp),              // eax    // input regs
4376               "1" (row)               // edi
4378             : "%edx", "%esi"                    // clobber list
4379 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4380             , "%mm0", "%mm1"
4381 #endif
4382          );
4383       }
4384       break;
4386       case 2:
4387       {
4388          _ActiveMask.use = 0x00000000ffff0000LL;
4389          _ShiftBpp.use = 16;       // == 2 * 8
4390          _ShiftRem.use = 48;       // == 64 - 16
4392          __asm__ __volatile__ (
4393             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4394                                                 //  active byte group
4395             "movl _dif, %%edx             \n\t"
4396             "movq %%mm7, %%mm6            \n\t"
4397 // preload  "movl row, %%edi              \n\t"
4398             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4399                                                 //  3rd active byte group
4400             "movl %%edi, %%esi            \n\t" // lp = row
4401             "movq %%mm6, %%mm5            \n\t"
4402 // preload  "movl bpp, %%eax              \n\t"
4403             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4404             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4405                                                 //  4th active byte group
4406             // prime the pump:  load the first Raw(x-bpp) data set
4407             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4409          "sub_2lp:                        \n\t" // shift data for adding first
4410             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4411                                                 //  shift clears inactive bytes)
4412             // add 1st active group
4413             "movq (%%edi,%%edx,), %%mm0   \n\t"
4414             "paddb %%mm1, %%mm0           \n\t"
4416             // add 2nd active group
4417             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4418             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4419             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4420             "paddb %%mm1, %%mm0           \n\t"
4422             // add 3rd active group
4423             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4424             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4425             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4426             "paddb %%mm1, %%mm0           \n\t"
4428             // add 4th active group
4429             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4430             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4431             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4432             "addl $8, %%edx               \n\t"
4433             "paddb %%mm1, %%mm0           \n\t"
4434             "cmpl _MMXLength, %%edx       \n\t"
4435             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4436             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4437             "jb sub_2lp                   \n\t"
4439             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4440               "=D" (dummy_value_D)    // 1
4442             : "0" (bpp),              // eax    // input regs
4443               "1" (row)               // edi
4445             : "%edx", "%esi"                    // clobber list
4446 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4447             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4448 #endif
4449          );
4450       }
4451       break;
4453       case 8:
4454       {
4455          __asm__ __volatile__ (
4456 // preload  "movl row, %%edi              \n\t"
4457             "movl _dif, %%edx             \n\t"
4458             "movl %%edi, %%esi            \n\t" // lp = row
4459 // preload  "movl bpp, %%eax              \n\t"
4460             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4461             "movl _MMXLength, %%ecx       \n\t"
4463             // prime the pump:  load the first Raw(x-bpp) data set
4464             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4465             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
4467          "sub_8lp:                        \n\t"
4468             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4469             "paddb %%mm7, %%mm0           \n\t"
4470             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4471             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4473             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4474             // This will be repeated for each group of 8 bytes with the 8th
4475             // group being used as the Raw(x-bpp) for the 1st group of the
4476             // next loop.
4478             "paddb %%mm0, %%mm1           \n\t"
4479             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4480             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4481             "paddb %%mm1, %%mm2           \n\t"
4482             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4483             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4484             "paddb %%mm2, %%mm3           \n\t"
4485             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4486             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4487             "paddb %%mm3, %%mm4           \n\t"
4488             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4489             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4490             "paddb %%mm4, %%mm5           \n\t"
4491             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4492             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4493             "paddb %%mm5, %%mm6           \n\t"
4494             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4495             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4496             "addl $64, %%edx              \n\t"
4497             "paddb %%mm6, %%mm7           \n\t"
4498             "cmpl %%ecx, %%edx            \n\t"
4499             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4500             "jb sub_8lp                   \n\t"
4502             "cmpl _MMXLength, %%edx       \n\t"
4503             "jnb sub_8lt8                 \n\t"
4505          "sub_8lpA:                       \n\t"
4506             "movq (%%edi,%%edx,), %%mm0   \n\t"
4507             "addl $8, %%edx               \n\t"
4508             "paddb %%mm7, %%mm0           \n\t"
4509             "cmpl _MMXLength, %%edx       \n\t"
4510             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4511             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4512                                                 //  to mm1 to be new Raw(x-bpp)
4513                                                 //  for next loop
4514             "jb sub_8lpA                  \n\t"
4516          "sub_8lt8:                       \n\t"
4518             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4519               "=D" (dummy_value_D)    // 1
4521             : "0" (bpp),              // eax    // input regs
4522               "1" (row)               // edi
4524             : "%ecx", "%edx", "%esi"            // clobber list
4525 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4526             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4527 #endif
4528          );
4529       }
4530       break;
4532       default:                // bpp greater than 8 bytes       GRR BOGUS
4533       {
4534          __asm__ __volatile__ (
4535             "movl _dif, %%edx             \n\t"
4536 // preload  "movl row, %%edi              \n\t"
4537             "movl %%edi, %%esi            \n\t" // lp = row
4538 // preload  "movl bpp, %%eax              \n\t"
4539             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4541          "sub_Alp:                        \n\t"
4542             "movq (%%edi,%%edx,), %%mm0   \n\t"
4543             "movq (%%esi,%%edx,), %%mm1   \n\t"
4544             "addl $8, %%edx               \n\t"
4545             "paddb %%mm1, %%mm0           \n\t"
4546             "cmpl _MMXLength, %%edx       \n\t"
4547             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4548                                                 //  -8 to offset addl edx
4549             "jb sub_Alp                   \n\t"
4551             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4552               "=D" (dummy_value_D)    // 1
4554             : "0" (bpp),              // eax    // input regs
4555               "1" (row)               // edi
4557             : "%edx", "%esi"                    // clobber list
4558 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4559             , "%mm0", "%mm1"
4560 #endif
4561          );
4562       }
4563       break;
4565    } // end switch (bpp)
4567    __asm__ __volatile__ (
4568       "movl _MMXLength, %%edx       \n\t"
4569 //pre "movl row, %%edi              \n\t"
4570       "cmpl _FullLength, %%edx      \n\t"
4571       "jnb sub_end                  \n\t"
4573       "movl %%edi, %%esi            \n\t" // lp = row
4574 //pre "movl bpp, %%eax              \n\t"
4575       "addl %%eax, %%edi            \n\t" // rp = row + bpp
4576       "xorl %%eax, %%eax            \n\t"
4578    "sub_lp2:                        \n\t"
4579       "movb (%%esi,%%edx,), %%al    \n\t"
4580       "addb %%al, (%%edi,%%edx,)    \n\t"
4581       "incl %%edx                   \n\t"
4582       "cmpl _FullLength, %%edx      \n\t"
4583       "jb sub_lp2                   \n\t"
4585    "sub_end:                        \n\t"
4586       "EMMS                         \n\t" // end MMX instructions
4588       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4589         "=D" (dummy_value_D)    // 1
4591       : "0" (bpp),              // eax    // input regs
4592         "1" (row)               // edi
4594       : "%edx", "%esi"                    // clobber list
4595    );
4597 } // end of png_read_filter_row_mmx_sub()
4602 //===========================================================================//
4603 //                                                                           //
4604 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
4605 //                                                                           //
4606 //===========================================================================//
4608 // Optimized code for PNG Up filter decoder
4610 static void /* PRIVATE */
4611 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4612                            png_bytep prev_row)
4614    png_uint_32 len;
4615    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
4616    int dummy_value_S;
4617    int dummy_value_D;
4619    len = row_info->rowbytes;              // number of bytes to filter
4621    __asm__ __volatile__ (
4622 //pre "movl row, %%edi              \n\t"
4623       // get # of bytes to alignment
4624       "movl %%edi, %%ecx            \n\t"
4625       "xorl %%ebx, %%ebx            \n\t"
4626       "addl $0x7, %%ecx             \n\t"
4627       "xorl %%eax, %%eax            \n\t"
4628       "andl $0xfffffff8, %%ecx      \n\t"
4629 //pre "movl prev_row, %%esi         \n\t"
4630       "subl %%edi, %%ecx            \n\t"
4631       "jz up_go                     \n\t"
4633    "up_lp1:                         \n\t" // fix alignment
4634       "movb (%%edi,%%ebx,), %%al    \n\t"
4635       "addb (%%esi,%%ebx,), %%al    \n\t"
4636       "incl %%ebx                   \n\t"
4637       "cmpl %%ecx, %%ebx            \n\t"
4638       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4639       "jb up_lp1                    \n\t" //  offset incl ebx
4641    "up_go:                          \n\t"
4642 //pre "movl len, %%edx              \n\t"
4643       "movl %%edx, %%ecx            \n\t"
4644       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
4645       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
4646       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4648       // unrolled loop - use all MMX registers and interleave to reduce
4649       // number of branch instructions (loops) and reduce partial stalls
4650    "up_loop:                        \n\t"
4651       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4652       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4653       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4654       "paddb %%mm1, %%mm0           \n\t"
4655       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4656       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4657       "paddb %%mm3, %%mm2           \n\t"
4658       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4659       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4660       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4661       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4662       "paddb %%mm5, %%mm4           \n\t"
4663       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4664       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4665       "paddb %%mm7, %%mm6           \n\t"
4666       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4667       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4668       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4669       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4670       "paddb %%mm1, %%mm0           \n\t"
4671       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4672       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4673       "paddb %%mm3, %%mm2           \n\t"
4674       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4675       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4676       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4677       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4678       "paddb %%mm5, %%mm4           \n\t"
4679       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4680       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4681       "addl $64, %%ebx              \n\t"
4682       "paddb %%mm7, %%mm6           \n\t"
4683       "cmpl %%ecx, %%ebx            \n\t"
4684       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4685       "jb up_loop                   \n\t" //  -8 to offset addl ebx
4687       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
4688       "jz up_end                    \n\t"
4690       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
4691       "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
4693       "addl %%edx, %%ecx            \n\t"
4694       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
4695       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4696       "jz up_lt8                    \n\t"
4698    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
4699       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4700       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4701       "addl $8, %%ebx               \n\t"
4702       "paddb %%mm1, %%mm0           \n\t"
4703       "cmpl %%ecx, %%ebx            \n\t"
4704       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4705       "jb up_lpA                    \n\t" //  offset add ebx
4706       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
4707       "jz up_end                    \n\t"
4709    "up_lt8:                         \n\t"
4710       "xorl %%eax, %%eax            \n\t"
4711       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
4713    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
4714       "movb (%%edi,%%ebx,), %%al    \n\t"
4715       "addb (%%esi,%%ebx,), %%al    \n\t"
4716       "incl %%ebx                   \n\t"
4717       "cmpl %%ecx, %%ebx            \n\t"
4718       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4719       "jb up_lp2                    \n\t" //  offset inc ebx
4721    "up_end:                         \n\t"
4722       "EMMS                         \n\t" // conversion of filtered row complete
4724       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
4725         "=S" (dummy_value_S),   // 1
4726         "=D" (dummy_value_D)    // 2
4728       : "0" (len),              // edx    // input regs
4729         "1" (prev_row),         // esi
4730         "2" (row)               // edi
4732       : "%eax", "%ebx", "%ecx"            // clobber list (no input regs!)
4734 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4735       , "%mm0", "%mm1", "%mm2", "%mm3"
4736       , "%mm4", "%mm5", "%mm6", "%mm7"
4737 #endif
4738    );
4740 } // end of png_read_filter_row_mmx_up()
4745 //===========================================================================//
4746 //                                                                           //
4747 //                   P N G _ R E A D _ F I L T E R _ R O W                   //
4748 //                                                                           //
4749 //===========================================================================//
4751 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
4753 // Optimized png_read_filter_row routines
4755 void /* PRIVATE */
4756 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
4757    row, png_bytep prev_row, int filter)
4759 #ifdef PNG_DEBUG
4760    char filnm[10];
4761 #endif
4763 /* GRR:  these are superseded by png_ptr->asm_flags: */
4764 #define UseMMX_sub    1   // GRR:  converted 20000730
4765 #define UseMMX_up     1   // GRR:  converted 20000729
4766 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
4767 #define UseMMX_paeth  1   // GRR:  converted 20000828
4769    if (_mmx_supported == 2) {
4770        png_mmx_support();
4771    }
4773 #ifdef PNG_DEBUG
4774    png_debug(1, "in png_read_filter_row\n");
4775    switch (filter)
4776    {
4777       case 0: sprintf(filnm, "none");
4778          break;
4779       case 1: sprintf(filnm, "sub-%s", "MMX");
4780          break;
4781       case 2: sprintf(filnm, "up-%s", "MMX");
4782          break;
4783       case 3: sprintf(filnm, "avg-%s", "MMX");
4784          break;
4785       case 4: sprintf(filnm, "Paeth-%s", "MMX");
4786          break;
4787       default: sprintf(filnm, "unknw");
4788          break;
4789    }
4790    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
4791    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
4792    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
4793       (int)((row_info->pixel_depth + 7) >> 3));
4794    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
4795 #endif /* PNG_DEBUG */
4797    switch (filter)
4798    {
4799       case PNG_FILTER_VALUE_NONE:
4800          break;
4802       case PNG_FILTER_VALUE_SUB:
4803          if (
4804              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4805              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4806          {
4807             png_read_filter_row_mmx_sub(row_info, row);
4808          }
4809          else
4810          {
4811             png_uint_32 i;
4812             png_uint_32 istop = row_info->rowbytes;
4813             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4814             png_bytep rp = row + bpp;
4815             png_bytep lp = row;
4817             for (i = bpp; i < istop; i++)
4818             {
4819                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
4820                rp++;
4821             }
4822          }  //end !UseMMX_sub
4823          break;
4825       case PNG_FILTER_VALUE_UP:
4826          if (
4827              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4828              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4829          {
4830             png_read_filter_row_mmx_up(row_info, row, prev_row);
4831          }
4832          else
4833          {
4834             png_uint_32 i;
4835             png_uint_32 istop = row_info->rowbytes;
4836             png_bytep rp = row;
4837             png_bytep pp = prev_row;
4839             for (i = 0; i < istop; ++i)
4840             {
4841                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4842                rp++;
4843             }
4844          }  //end !UseMMX_up
4845          break;
4847       case PNG_FILTER_VALUE_AVG:
4848          if (
4849              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4850              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4851          {
4852             png_read_filter_row_mmx_avg(row_info, row, prev_row);
4853          }
4854          else
4855          {
4856             png_uint_32 i;
4857             png_bytep rp = row;
4858             png_bytep pp = prev_row;
4859             png_bytep lp = row;
4860             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4861             png_uint_32 istop = row_info->rowbytes - bpp;
4863             for (i = 0; i < bpp; i++)
4864             {
4865                *rp = (png_byte)(((int)(*rp) +
4866                   ((int)(*pp++) >> 1)) & 0xff);
4867                rp++;
4868             }
4870             for (i = 0; i < istop; i++)
4871             {
4872                *rp = (png_byte)(((int)(*rp) +
4873                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
4874                rp++;
4875             }
4876          }  //end !UseMMX_avg
4877          break;
4879       case PNG_FILTER_VALUE_PAETH:
4880          if (
4881              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4882              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4883          {
4884             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
4885          }
4886          else
4887          {
4888             png_uint_32 i;
4889             png_bytep rp = row;
4890             png_bytep pp = prev_row;
4891             png_bytep lp = row;
4892             png_bytep cp = prev_row;
4893             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4894             png_uint_32 istop = row_info->rowbytes - bpp;
4896             for (i = 0; i < bpp; i++)
4897             {
4898                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4899                rp++;
4900             }
4902             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
4903             {
4904                int a, b, c, pa, pb, pc, p;
4906                a = *lp++;
4907                b = *pp++;
4908                c = *cp++;
4910                p = b - c;
4911                pc = a - c;
4913 #ifdef PNG_USE_ABS
4914                pa = abs(p);
4915                pb = abs(pc);
4916                pc = abs(p + pc);
4917 #else
4918                pa = p < 0 ? -p : p;
4919                pb = pc < 0 ? -pc : pc;
4920                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
4921 #endif
4923                /*
4924                   if (pa <= pb && pa <= pc)
4925                      p = a;
4926                   else if (pb <= pc)
4927                      p = b;
4928                   else
4929                      p = c;
4930                 */
4932                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
4934                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
4935                rp++;
4936             }
4937          }  //end !UseMMX_paeth
4938          break;
4940       default:
4941          png_warning(png_ptr, "Ignoring bad row-filter type");
4942          *row=0;
4943          break;
4944    }
4947 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
4952 //===========================================================================//
4953 //                                                                           //
4954 //                      P N G _ M M X _ S U P P O R T                        //
4955 //                                                                           //
4956 //===========================================================================//
4958 // GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
4959 //             (2) all instructions compile with gcc 2.7.2.3 and later
4960 //             (3) the function is moved down here to prevent gcc from
4961 //                  inlining it in multiple places and then barfing be-
4962 //                  cause the ".NOT_SUPPORTED" label is multiply defined
4963 //             [is there a way to signal that a *single* function should
4964 //              not be inlined?  is there a way to modify the label for
4965 //              each inlined instance, e.g., by appending _1, _2, etc.?
4966 //              maybe if don't use leading "." in label name? (nope...sigh)]
4968 // GRR TO DO:  make sure PNGAPI doesn't do/require anything screwy here
4969 //             [looks OK for everybody except possibly Cygwin (__cdecl)]
4971 int PNGAPI
4972 png_mmx_support(void)
4974     __asm__ __volatile__ (
4975         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
4976         "pushl %%ecx          \n\t"  // so does ecx...
4977         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
4978 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
4979 //      "pushf                \n\t"  // 16-bit pushf
4980         "pushfl               \n\t"  // save Eflag to stack
4981         "popl %%eax           \n\t"  // get Eflag from stack into eax
4982         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
4983         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4984         "pushl %%eax          \n\t"  // save modified Eflag back to stack
4985 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
4986 //      "popf                 \n\t"  // 16-bit popf
4987         "popfl                \n\t"  // restore modified value to Eflag reg
4988         "pushfl               \n\t"  // save Eflag to stack
4989         "popl %%eax           \n\t"  // get Eflag from stack
4990         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
4991         "jz .NOT_SUPPORTED    \n\t"  // if same, CPUID instr. is not supported
4993         "xorl %%eax, %%eax    \n\t"  // set eax to zero
4994 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
4995         "cpuid                \n\t"  // get the CPU identification info
4996         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
4997         "jl .NOT_SUPPORTED    \n\t"  // if eax is zero, MMX is not supported
4999         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5000         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5001                                      // faster than the instruction "mov eax, 1"
5002         "cpuid                \n\t"  // get the CPU identification info again
5003         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5004         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5005         "jz .NOT_SUPPORTED    \n\t"  // non-zero = yes, MMX IS supported
5007         "movl $1, %%eax       \n\t"  // set return value to 1
5008         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5009         "popl %%edx           \n\t"  // restore edx
5010         "popl %%ecx           \n\t"  // restore ecx
5011         "popl %%ebx           \n\t"  // restore ebx ("row" in png_do_interlace)
5012         "ret                  \n\t"  // DONE:  have MMX support
5014     ".NOT_SUPPORTED:          \n\t"  // target label for jump instructions
5015         "movl $0, %%eax       \n\t"  // set return value to 0
5016         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5017         "popl %%edx           \n\t"  // restore edx
5018         "popl %%ecx           \n\t"  // restore ecx
5019         "popl %%ebx           \n\t"  // restore ebx ("row" in png_do_interlace)
5020 //      "ret                  \n\t"  // DONE:  no MMX support
5021                                      // (fall through to standard C "ret")
5023         :                            // output list (none)
5025         :                            // any variables used on input (none)
5027         : "%eax"                     // clobber list
5028 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5029 //      , "memory"   // if write to a variable gcc thought was in a reg
5030 //      , "cc"       // "condition codes" (flag bits)
5031     );
5033     // return %%eax;
5036 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */