Code

The BIG graph update
[rrdtool-all.git] / program / libraries / libpng-1.2.0 / pnggccrd.c
1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2  *
3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4  *
5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
8  *
9  * libpng version 1.2.0 - September 1, 2001
10  * For conditions of distribution and use, see copyright notice in png.h
11  * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12  * Copyright (c) 1998, Intel Corporation
13  *
14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15  * Interface to libpng contributed by Gilles Vollant, 1999.
16  * GNU C port by Greg Roelofs, 1999-2001.
17  *
18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19  *
20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21  *
22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23  *
24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25  *        is required to assemble the newer MMX instructions such as movq.
26  *        For djgpp, see
27  *
28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29  *
30  *        (or a later version in the same directory).  For Linux, check your
31  *        distribution's web site(s) or try these links:
32  *
33  *           http://rufus.w3.org/linux/RPM/binutils.html
34  *           http://www.debian.org/Packages/stable/devel/binutils.html
35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36  *             binutils.tgz
37  *
38  *        For other platforms, see the main GNU site:
39  *
40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
41  *
42  *        Version 2.5.2l.15 is definitely too old...
43  */
45 /*
46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * =====================================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 1606)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (e.g., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (e.g., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991023:
91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92  *  - switched from string-concatenation-with-macros to cleaner method of
93  *     renaming global variables for djgpp--i.e., always use prefixes in
94  *     inlined assembler code (== strings) and conditionally rename the
95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96  *
97  * 19991024:
98  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
99  *     This one was severely weird:  even though mmxsupport() doesn't touch
100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101  *     the register (even in static/non-fPIC code--see below), which in turn
102  *     caused png_do_read_interlace() to return prematurely on the first row of
103  *     interlaced images (i.e., without expanding the interlaced pixels).
104  *     Inspection of the generated assembly code didn't turn up any clues,
105  *     although it did point at a minor optimization (i.e., get rid of
106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107  *     instruction is more destructive than it looks?  (Not yet checked.)
108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109  *     listings...  Apparently register spillage has to do with ebx, since
110  *     it's used to index the global offset table.  Commenting it out of the
111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113  *
114  * 19991107:
115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117  *
118  * 19991120:
119  *  - made "diff" variable (now "_dif") global to simplify conversion of
120  *     filtering routines (running out of regs, sigh).  "diff" is still used
121  *     in interlacing routines, however.
122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123  *     macro determines which is used); original not yet tested.
124  *
125  * 20000213:
126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127  *
128  * 20000319:
129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130  *     pass == 4 or 5, that caused visible corruption of interlaced images
131  *
132  * 20000623:
133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138  *     for the original (anonymous) SourceForge bug report.
139  *
140  * 20000706:
141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142  *       pnggccrd.c: In function `png_combine_row':
143  *       pnggccrd.c:525: more than 10 operands in `asm'
144  *       pnggccrd.c:669: more than 10 operands in `asm'
145  *       pnggccrd.c:828: more than 10 operands in `asm'
146  *       pnggccrd.c:994: more than 10 operands in `asm'
147  *       pnggccrd.c:1177: more than 10 operands in `asm'
148  *     They are all the same problem and can be worked around by using the
149  *     global _unmask variable unconditionally, not just in the -fPIC case.
150  *     Reportedly earlier versions of gcc also have the problem with more than
151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152  *
153  * 20000729:
154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
156  *  - to finish remaining sections:
157  *     - clean up indentation and comments
158  *     - preload local variables
159  *     - add output and input regs (order of former determines numerical
160  *        mapping of latter)
161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
163  *
164  * 20000731:
165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166  *
167  * 20000822:
168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169  *     shared-library (-fPIC) version!  Code works just fine as part of static
170  *     library.  Damn damn damn damn damn, should have tested that sooner.
171  *     ebx is getting clobbered again (explicitly this time); need to save it
172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173  *
174  * 20000823:
175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
177  *     and *Mask* globals and got rid of leading "$" signs.
178  *
179  * 20000826:
180  *  - added visual separators to help navigate microscopic printed copies
181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182  *     on png_read_filter_row_mmx_avg()
183  *
184  * 20000828:
185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187  *     cleaned up/shortened in either routine, but functionality is complete
188  *     and seems to be working fine.
189  *
190  * 20000829:
191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194  *     is simple enough...
195  *
196  * 20000914:
197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198  *     correctly (but 48-bit RGB just fine)
199  *
200  * 20000916:
201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205  *
206  * 20010101:
207  *  - added new png_init_mmx_flags() function (here only because it needs to
208  *     call mmxsupport(), which should probably become global png_mmxsupport());
209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
210  *
211  * 20010103:
212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
214  *
215  * 20010104:
216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
217  *     within MMX version of png_read_filter_row()) so no longer necessary to
218  *     compile it into pngrutil.o
219  *
220  * 20010310:
221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222  *
223  * STILL TO DO:
224  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
225  *     - write MMX code for 48-bit case (pixel_bytes == 6)
226  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
227  *        why subtract 8 from width_mmx in the pass 4/5 case?
228  *        (only width_mmx case) (near line 1606)
229  *     - rewrite all MMX interlacing code so it's aligned with beginning
230  *        of the row buffer, not the end (see 19991007 for details)
231  *     x pick one version of mmxsupport() and get rid of the other
232  *     - add error messages to any remaining bogus default cases
233  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
234  *     x add support for runtime enable/disable/query of various MMX routines
235  */
237 #define PNG_INTERNAL
238 #include "png.h"
240 #if defined(PNG_USE_PNGGCCRD)
/* Prototype for the MMX-detection routine.  png_combine_row() below calls it
 * when _mmx_supported still holds its initial value. */
242 int PNGAPI png_mmx_support(void);
/* Lookup tables indexed by png_ptr->pass (seven entries, passes 0-6).  The
 * 8-bit C fallback in png_combine_row() uses them as, respectively, the
 * offset of the first pixel to copy, the stride between copies, and the
 * number of bytes copied at each step.  They are defined locally only when
 * PNG_USE_LOCAL_ARRAYS is set. */
244 #ifdef PNG_USE_LOCAL_ARRAYS
245 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
246 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
247 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
248 #endif
250 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* The inline-assembler strings in this file always spell these globals with
 * a leading underscore (e.g. "movq _mask8_0, %%mm0").  On the toolchains
 * tested below, the macros map the underscore-prefixed C identifiers onto
 * the un-prefixed names, so the C definitions further down are emitted
 * without the extra underscore. */
251 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
252  * so define them without: */
253 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
254 #  define _mmx_supported  mmx_supported
255 #  define _const4         const4
256 #  define _const6         const6
257 #  define _mask8_0        mask8_0
258 #  define _mask16_1       mask16_1
259 #  define _mask16_0       mask16_0
260 #  define _mask24_2       mask24_2
261 #  define _mask24_1       mask24_1
262 #  define _mask24_0       mask24_0
263 #  define _mask32_3       mask32_3
264 #  define _mask32_2       mask32_2
265 #  define _mask32_1       mask32_1
266 #  define _mask32_0       mask32_0
267 #  define _mask48_5       mask48_5
268 #  define _mask48_4       mask48_4
269 #  define _mask48_3       mask48_3
270 #  define _mask48_2       mask48_2
271 #  define _mask48_1       mask48_1
272 #  define _mask48_0       mask48_0
273 #  define _LBCarryMask    LBCarryMask
274 #  define _HBClearMask    HBClearMask
275 #  define _ActiveMask     ActiveMask
276 #  define _ActiveMask2    ActiveMask2
277 #  define _ActiveMaskEnd  ActiveMaskEnd
278 #  define _ShiftBpp       ShiftBpp
279 #  define _ShiftRem       ShiftRem
/* The remaining names are renamed only under PNG_THREAD_UNSAFE_OK, the same
 * condition under which the corresponding variables are defined below. */
280 #ifdef PNG_THREAD_UNSAFE_OK
281 #  define _unmask         unmask
282 #  define _FullLength     FullLength
283 #  define _MMXLength      MMXLength
284 #  define _dif            dif
285 #  define _patemp         patemp
286 #  define _pbtemp         pbtemp
287 #  define _pctemp         pctemp
288 #endif
289 #endif
292 /* These constants are used in the inlined MMX assembly code.
293    Ignore gcc's "At top level: defined but not used" warnings. */
295 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
296  *  since that case uses the %ebx register for indexing the Global Offset Table
297  *  and there were no other registers available.  But gcc 2.95 and later emit
298  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
299  *  in the non-PIC case, so we'll just use the global unconditionally now.
300  */
/* _unmask receives ~mask in png_combine_row() immediately before each asm
 * block, which then loads it with "movd _unmask, %%mm7". */
301 #ifdef PNG_THREAD_UNSAFE_OK
302 static int _unmask;
303 #endif
/* Bit-select constants: every byte of each value has exactly one bit set.
 * png_combine_row() replicates the complemented 8-bit pixel mask into all
 * eight bytes of an MMX register, ANDs it with one of these constants and
 * compares the result with zero (pcmpeqb), giving a per-byte 0x00/0xFF
 * select mask.  _mask8_0 is used by the 8-bit case and _mask16_0/_mask16_1
 * by the 16-bit case; the 24/32/48 variants presumably serve the matching
 * pixel depths (their uses lie outside this section -- confirm). */
305 static unsigned long long _mask8_0  = 0x0102040810204080LL;
307 static unsigned long long _mask16_1 = 0x0101020204040808LL;
308 static unsigned long long _mask16_0 = 0x1010202040408080LL;
310 static unsigned long long _mask24_2 = 0x0101010202020404LL;
311 static unsigned long long _mask24_1 = 0x0408080810101020LL;
312 static unsigned long long _mask24_0 = 0x2020404040808080LL;
314 static unsigned long long _mask32_3 = 0x0101010102020202LL;
315 static unsigned long long _mask32_2 = 0x0404040408080808LL;
316 static unsigned long long _mask32_1 = 0x1010101020202020LL;
317 static unsigned long long _mask32_0 = 0x4040404080808080LL;
319 static unsigned long long _mask48_5 = 0x0101010101010202LL;
320 static unsigned long long _mask48_4 = 0x0202020204040404LL;
321 static unsigned long long _mask48_3 = 0x0404080808080808LL;
322 static unsigned long long _mask48_2 = 0x1010101010102020LL;
323 static unsigned long long _mask48_1 = 0x2020202040404040LL;
324 static unsigned long long _mask48_0 = 0x4040808080808080LL;
/* _const4 keeps the low three bytes of a quadword and _const6 the low byte;
 * NOTE(review): their uses are not visible in this section. */
326 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
327 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
328 static unsigned long long _const6   = 0x00000000000000FFLL;
330 // These are used in the row-filter routines and should/would be local
331 //  variables if not for gcc addressing limitations.
332 // WARNING: Their presence probably defeats the thread safety of libpng.
// Because they are shared file-scope state, they are compiled in only when
// the builder opts in by defining PNG_THREAD_UNSAFE_OK.
334 #ifdef PNG_THREAD_UNSAFE_OK
335 static png_uint_32  _FullLength;
336 static png_uint_32  _MMXLength;
337 static int          _dif;
338 static int          _patemp;    // temp variables for Paeth routine
339 static int          _pbtemp;
340 static int          _pctemp;
341 #endif
343 void /* PRIVATE */
344 png_squelch_warnings(void)
346 #ifdef PNG_THREAD_UNSAFE_OK
347    _dif = _dif;
348    _patemp = _patemp;
349    _pbtemp = _pbtemp;
350    _pctemp = _pctemp;
351    _MMXLength = _MMXLength;
352 #endif
353    _const4  = _const4;
354    _const6  = _const6;
355    _mask8_0  = _mask8_0;
356    _mask16_1 = _mask16_1;
357    _mask16_0 = _mask16_0;
358    _mask24_2 = _mask24_2;
359    _mask24_1 = _mask24_1;
360    _mask24_0 = _mask24_0;
361    _mask32_3 = _mask32_3;
362    _mask32_2 = _mask32_2;
363    _mask32_1 = _mask32_1;
364    _mask32_0 = _mask32_0;
365    _mask48_5 = _mask48_5;
366    _mask48_4 = _mask48_4;
367    _mask48_3 = _mask48_3;
368    _mask48_2 = _mask48_2;
369    _mask48_1 = _mask48_1;
370    _mask48_0 = _mask48_0;
372 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* MMX detection state.  The initial value 2 means "not determined yet":
 * png_combine_row() tests for 2, issues a warning and calls
 * png_mmx_support().  NOTE(review): per the 20010103 changelog entry,
 * png_mmx_support() sets this variable; the meaning of its other values is
 * defined outside this section. */
375 static int _mmx_supported = 2;
377 /*===========================================================================*/
378 /*                                                                           */
379 /*                       P N G _ C O M B I N E _ R O W                       */
380 /*                                                                           */
381 /*===========================================================================*/
383 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
/* Named bytes-per-pixel constants, one per supported pixel size. */
385 #define BPP2  2
386 #define BPP3  3         /* bytes per pixel (a.k.a. pixel_bytes) */
387 #define BPP4  4
388 #define BPP6  6         /* (defined only to help avoid cut-and-paste errors) */
389 #define BPP8  8
391 /* Combines the row recently read in with the previous row.
392    This routine takes care of alpha and transparency if requested.
393    This routine also handles the two methods of progressive display
394    of interlaced images, depending on the mask value.
395    The mask value describes which pixels are to be combined with
396    the row.  The pattern always repeats every 8 pixels, so just 8
397    bits are needed.  A one indicates the pixel is to be combined; a
398    zero indicates the pixel is to be skipped.  This is in addition
399    to any alpha or transparency value associated with the pixel.
400    If you want all pixels to be combined, pass 0xff (255) in mask. */
402 /* Use this routine for the x86 platform - it uses a faster MMX routine
403    if the machine supports MMX. */
405 void /* PRIVATE */
406 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
408    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
410 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
411    if (_mmx_supported == 2) {
412        /* this should have happened in png_init_mmx_flags() already */
413        png_warning(png_ptr, "asm_flags may not have been initialized");
414        png_mmx_support();
415    }
416 #endif
418    if (mask == 0xff)
419    {
420       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
421       png_memcpy(row, png_ptr->row_buf + 1,
422        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
423    }
424    else   /* (png_combine_row() is never called with mask == 0) */
425    {
426       switch (png_ptr->row_info.pixel_depth)
427       {
428          case 1:        /* png_ptr->row_info.pixel_depth */
429          {
430             png_bytep sp;
431             png_bytep dp;
432             int s_inc, s_start, s_end;
433             int m;
434             int shift;
435             png_uint_32 i;
437             sp = png_ptr->row_buf + 1;
438             dp = row;
439             m = 0x80;
440 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
441             if (png_ptr->transformations & PNG_PACKSWAP)
442             {
443                 s_start = 0;
444                 s_end = 7;
445                 s_inc = 1;
446             }
447             else
448 #endif
449             {
450                 s_start = 7;
451                 s_end = 0;
452                 s_inc = -1;
453             }
455             shift = s_start;
457             for (i = 0; i < png_ptr->width; i++)
458             {
459                if (m & mask)
460                {
461                   int value;
463                   value = (*sp >> shift) & 0x1;
464                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
465                   *dp |= (png_byte)(value << shift);
466                }
468                if (shift == s_end)
469                {
470                   shift = s_start;
471                   sp++;
472                   dp++;
473                }
474                else
475                   shift += s_inc;
477                if (m == 1)
478                   m = 0x80;
479                else
480                   m >>= 1;
481             }
482             break;
483          }
485          case 2:        /* png_ptr->row_info.pixel_depth */
486          {
487             png_bytep sp;
488             png_bytep dp;
489             int s_start, s_end, s_inc;
490             int m;
491             int shift;
492             png_uint_32 i;
493             int value;
495             sp = png_ptr->row_buf + 1;
496             dp = row;
497             m = 0x80;
498 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
499             if (png_ptr->transformations & PNG_PACKSWAP)
500             {
501                s_start = 0;
502                s_end = 6;
503                s_inc = 2;
504             }
505             else
506 #endif
507             {
508                s_start = 6;
509                s_end = 0;
510                s_inc = -2;
511             }
513             shift = s_start;
515             for (i = 0; i < png_ptr->width; i++)
516             {
517                if (m & mask)
518                {
519                   value = (*sp >> shift) & 0x3;
520                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
521                   *dp |= (png_byte)(value << shift);
522                }
524                if (shift == s_end)
525                {
526                   shift = s_start;
527                   sp++;
528                   dp++;
529                }
530                else
531                   shift += s_inc;
532                if (m == 1)
533                   m = 0x80;
534                else
535                   m >>= 1;
536             }
537             break;
538          }
540          case 4:        /* png_ptr->row_info.pixel_depth */
541          {
542             png_bytep sp;
543             png_bytep dp;
544             int s_start, s_end, s_inc;
545             int m;
546             int shift;
547             png_uint_32 i;
548             int value;
550             sp = png_ptr->row_buf + 1;
551             dp = row;
552             m = 0x80;
553 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
554             if (png_ptr->transformations & PNG_PACKSWAP)
555             {
556                s_start = 0;
557                s_end = 4;
558                s_inc = 4;
559             }
560             else
561 #endif
562             {
563                s_start = 4;
564                s_end = 0;
565                s_inc = -4;
566             }
567             shift = s_start;
569             for (i = 0; i < png_ptr->width; i++)
570             {
571                if (m & mask)
572                {
573                   value = (*sp >> shift) & 0xf;
574                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
575                   *dp |= (png_byte)(value << shift);
576                }
578                if (shift == s_end)
579                {
580                   shift = s_start;
581                   sp++;
582                   dp++;
583                }
584                else
585                   shift += s_inc;
586                if (m == 1)
587                   m = 0x80;
588                else
589                   m >>= 1;
590             }
591             break;
592          }
594          case 8:        /* png_ptr->row_info.pixel_depth */
595          {
596             png_bytep srcptr;
597             png_bytep dstptr;
599 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
600             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
601                 /* && _mmx_supported */ )
602             {
603                png_uint_32 len;
604                int diff;
605                int dummy_value_a;   // fix 'forbidden register spilled' error
606                int dummy_value_d;
607                int dummy_value_c;
608                int dummy_value_S;
609                int dummy_value_D;
610                _unmask = ~mask;            // global variable for -fPIC version
611                srcptr = png_ptr->row_buf + 1;
612                dstptr = row;
613                len  = png_ptr->width &~7;  // reduce to multiple of 8
614                diff = (int) (png_ptr->width & 7);  // amount lost
616                __asm__ __volatile__ (
617                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
618                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
619                   "punpcklbw %%mm7, %%mm7    \n\t"
620                   "punpcklwd %%mm7, %%mm7    \n\t"
621                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
623                   "movq      _mask8_0, %%mm0 \n\t"
624                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
625                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
627 // preload        "movl      len, %%ecx      \n\t" // load length of line
628 // preload        "movl      srcptr, %%esi   \n\t" // load source
629 // preload        "movl      dstptr, %%edi   \n\t" // load dest
631                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
632                   "je        mainloop8end    \n\t"
634                 "mainloop8:                  \n\t"
635                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
636                   "pand      %%mm0, %%mm4    \n\t"
637                   "movq      %%mm0, %%mm6    \n\t"
638                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
639                   "por       %%mm6, %%mm4    \n\t"
640                   "movq      %%mm4, (%%edi)  \n\t"
641                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
642                   "addl      $8, %%edi       \n\t"
643                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
644                   "ja        mainloop8       \n\t"
646                 "mainloop8end:               \n\t"
647 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
648                   "movl      %%eax, %%ecx    \n\t"
649                   "cmpl      $0, %%ecx       \n\t"
650                   "jz        end8            \n\t"
651 // preload        "movl      mask, %%edx     \n\t"
652                   "sall      $24, %%edx      \n\t" // make low byte, high byte
654                 "secondloop8:                \n\t"
655                   "sall      %%edx           \n\t" // move high bit to CF
656                   "jnc       skip8           \n\t" // if CF = 0
657                   "movb      (%%esi), %%al   \n\t"
658                   "movb      %%al, (%%edi)   \n\t"
660                 "skip8:                      \n\t"
661                   "incl      %%esi           \n\t"
662                   "incl      %%edi           \n\t"
663                   "decl      %%ecx           \n\t"
664                   "jnz       secondloop8     \n\t"
666                 "end8:                       \n\t"
667                   "EMMS                      \n\t"  // DONE
669                   : "=a" (dummy_value_a),           // output regs (dummy)
670                     "=d" (dummy_value_d),
671                     "=c" (dummy_value_c),
672                     "=S" (dummy_value_S),
673                     "=D" (dummy_value_D)
675                   : "3" (srcptr),      // esi       // input regs
676                     "4" (dstptr),      // edi
677                     "0" (diff),        // eax
678 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
679                     "2" (len),         // ecx
680                     "1" (mask)         // edx
682 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
683                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
684 #endif
685                );
686             }
687             else /* mmx _not supported - Use modified C routine */
688 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
689             {
690                register png_uint_32 i;
691                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
692                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
693                register int stride = png_pass_inc[png_ptr->pass];
694                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
695                register int rep_bytes = png_pass_width[png_ptr->pass];
696                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
697                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
698                int diff = (int) (png_ptr->width & 7); /* amount lost */
699                register png_uint_32 final_val = len;  /* GRR bugfix */
701                srcptr = png_ptr->row_buf + 1 + initial_val;
702                dstptr = row + initial_val;
704                for (i = initial_val; i < final_val; i += stride)
705                {
706                   png_memcpy(dstptr, srcptr, rep_bytes);
707                   srcptr += stride;
708                   dstptr += stride;
709                }
710                if (diff)  /* number of leftover pixels:  3 for pngtest */
711                {
712                   final_val+=diff /* *BPP1 */ ;
713                   for (; i < final_val; i += stride)
714                   {
715                      if (rep_bytes > (int)(final_val-i))
716                         rep_bytes = (int)(final_val-i);
717                      png_memcpy(dstptr, srcptr, rep_bytes);
718                      srcptr += stride;
719                      dstptr += stride;
720                   }
721                }
723             } /* end of else (_mmx_supported) */
725             break;
726          }       /* end 8 bpp */
728          case 16:       /* png_ptr->row_info.pixel_depth */
729          {
730             png_bytep srcptr;
731             png_bytep dstptr;
733 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
734             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
735                 /* && _mmx_supported */ )
736             {
737                png_uint_32 len;
738                int diff;
739                int dummy_value_a;   // fix 'forbidden register spilled' error
740                int dummy_value_d;
741                int dummy_value_c;
742                int dummy_value_S;
743                int dummy_value_D;
744                _unmask = ~mask;            // global variable for -fPIC version
745                srcptr = png_ptr->row_buf + 1;
746                dstptr = row;
747                len  = png_ptr->width &~7;  // reduce to multiple of 8
748                diff = (int) (png_ptr->width & 7); // amount lost //
750                __asm__ __volatile__ (
751                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
752                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
753                   "punpcklbw %%mm7, %%mm7     \n\t"
754                   "punpcklwd %%mm7, %%mm7     \n\t"
755                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
757                   "movq      _mask16_0, %%mm0 \n\t"
758                   "movq      _mask16_1, %%mm1 \n\t"
760                   "pand      %%mm7, %%mm0     \n\t"
761                   "pand      %%mm7, %%mm1     \n\t"
763                   "pcmpeqb   %%mm6, %%mm0     \n\t"
764                   "pcmpeqb   %%mm6, %%mm1     \n\t"
766 // preload        "movl      len, %%ecx       \n\t" // load length of line
767 // preload        "movl      srcptr, %%esi    \n\t" // load source
768 // preload        "movl      dstptr, %%edi    \n\t" // load dest
770                   "cmpl      $0, %%ecx        \n\t"
771                   "jz        mainloop16end    \n\t"
773                 "mainloop16:                  \n\t"
774                   "movq      (%%esi), %%mm4   \n\t"
775                   "pand      %%mm0, %%mm4     \n\t"
776                   "movq      %%mm0, %%mm6     \n\t"
777                   "movq      (%%edi), %%mm7   \n\t"
778                   "pandn     %%mm7, %%mm6     \n\t"
779                   "por       %%mm6, %%mm4     \n\t"
780                   "movq      %%mm4, (%%edi)   \n\t"
782                   "movq      8(%%esi), %%mm5  \n\t"
783                   "pand      %%mm1, %%mm5     \n\t"
784                   "movq      %%mm1, %%mm7     \n\t"
785                   "movq      8(%%edi), %%mm6  \n\t"
786                   "pandn     %%mm6, %%mm7     \n\t"
787                   "por       %%mm7, %%mm5     \n\t"
788                   "movq      %%mm5, 8(%%edi)  \n\t"
790                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
791                   "addl      $16, %%edi       \n\t"
792                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
793                   "ja        mainloop16       \n\t"
795                 "mainloop16end:               \n\t"
796 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
797                   "movl      %%eax, %%ecx     \n\t"
798                   "cmpl      $0, %%ecx        \n\t"
799                   "jz        end16            \n\t"
800 // preload        "movl      mask, %%edx      \n\t"
801                   "sall      $24, %%edx       \n\t" // make low byte, high byte
803                 "secondloop16:                \n\t"
804                   "sall      %%edx            \n\t" // move high bit to CF
805                   "jnc       skip16           \n\t" // if CF = 0
806                   "movw      (%%esi), %%ax    \n\t"
807                   "movw      %%ax, (%%edi)    \n\t"
809                 "skip16:                      \n\t"
810                   "addl      $2, %%esi        \n\t"
811                   "addl      $2, %%edi        \n\t"
812                   "decl      %%ecx            \n\t"
813                   "jnz       secondloop16     \n\t"
815                 "end16:                       \n\t"
816                   "EMMS                       \n\t" // DONE
818                   : "=a" (dummy_value_a),           // output regs (dummy)
819                     "=c" (dummy_value_c),
820                     "=d" (dummy_value_d),
821                     "=S" (dummy_value_S),
822                     "=D" (dummy_value_D)
824                   : "0" (diff),        // eax       // input regs
825 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
826                     "1" (len),         // ecx
827                     "2" (mask),        // edx
828                     "3" (srcptr),      // esi
829                     "4" (dstptr)       // edi
831 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
832                   : "%mm0", "%mm1", "%mm4"          // clobber list
833                   , "%mm5", "%mm6", "%mm7"
834 #endif
835                );
836             }
837             else /* mmx _not supported - Use modified C routine */
838 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
839             {
840                register png_uint_32 i;
841                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
842                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
843                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
844                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
845                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
846                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
847                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
848                int diff = (int) (png_ptr->width & 7); /* amount lost */
849                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
851                srcptr = png_ptr->row_buf + 1 + initial_val;
852                dstptr = row + initial_val;
854                for (i = initial_val; i < final_val; i += stride)
855                {
856                   png_memcpy(dstptr, srcptr, rep_bytes);
857                   srcptr += stride;
858                   dstptr += stride;
859                }
860                if (diff)  /* number of leftover pixels:  3 for pngtest */
861                {
862                   final_val+=diff*BPP2;
863                   for (; i < final_val; i += stride)
864                   {
865                      if (rep_bytes > (int)(final_val-i))
866                         rep_bytes = (int)(final_val-i);
867                      png_memcpy(dstptr, srcptr, rep_bytes);
868                      srcptr += stride;
869                      dstptr += stride;
870                   }
871                }
872             } /* end of else (_mmx_supported) */
874             break;
875          }       /* end 16 bpp */
877          case 24:       /* png_ptr->row_info.pixel_depth */
878          {
879             png_bytep srcptr;
880             png_bytep dstptr;
882 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
883             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
884                 /* && _mmx_supported */ )
885             {
886                png_uint_32 len;
887                int diff;
888                int dummy_value_a;   // fix 'forbidden register spilled' error
889                int dummy_value_d;
890                int dummy_value_c;
891                int dummy_value_S;
892                int dummy_value_D;
893                _unmask = ~mask;            // global variable for -fPIC version
894                srcptr = png_ptr->row_buf + 1;
895                dstptr = row;
896                len  = png_ptr->width &~7;  // reduce to multiple of 8
897                diff = (int) (png_ptr->width & 7); // amount lost //
899                __asm__ __volatile__ (
900                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
901                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
902                   "punpcklbw %%mm7, %%mm7     \n\t"
903                   "punpcklwd %%mm7, %%mm7     \n\t"
904                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
906                   "movq      _mask24_0, %%mm0 \n\t"
907                   "movq      _mask24_1, %%mm1 \n\t"
908                   "movq      _mask24_2, %%mm2 \n\t"
910                   "pand      %%mm7, %%mm0     \n\t"
911                   "pand      %%mm7, %%mm1     \n\t"
912                   "pand      %%mm7, %%mm2     \n\t"
914                   "pcmpeqb   %%mm6, %%mm0     \n\t"
915                   "pcmpeqb   %%mm6, %%mm1     \n\t"
916                   "pcmpeqb   %%mm6, %%mm2     \n\t"
918 // preload        "movl      len, %%ecx       \n\t" // load length of line
919 // preload        "movl      srcptr, %%esi    \n\t" // load source
920 // preload        "movl      dstptr, %%edi    \n\t" // load dest
922                   "cmpl      $0, %%ecx        \n\t"
923                   "jz        mainloop24end    \n\t"
925                 "mainloop24:                  \n\t"
926                   "movq      (%%esi), %%mm4   \n\t"
927                   "pand      %%mm0, %%mm4     \n\t"
928                   "movq      %%mm0, %%mm6     \n\t"
929                   "movq      (%%edi), %%mm7   \n\t"
930                   "pandn     %%mm7, %%mm6     \n\t"
931                   "por       %%mm6, %%mm4     \n\t"
932                   "movq      %%mm4, (%%edi)   \n\t"
934                   "movq      8(%%esi), %%mm5  \n\t"
935                   "pand      %%mm1, %%mm5     \n\t"
936                   "movq      %%mm1, %%mm7     \n\t"
937                   "movq      8(%%edi), %%mm6  \n\t"
938                   "pandn     %%mm6, %%mm7     \n\t"
939                   "por       %%mm7, %%mm5     \n\t"
940                   "movq      %%mm5, 8(%%edi)  \n\t"
942                   "movq      16(%%esi), %%mm6 \n\t"
943                   "pand      %%mm2, %%mm6     \n\t"
944                   "movq      %%mm2, %%mm4     \n\t"
945                   "movq      16(%%edi), %%mm7 \n\t"
946                   "pandn     %%mm7, %%mm4     \n\t"
947                   "por       %%mm4, %%mm6     \n\t"
948                   "movq      %%mm6, 16(%%edi) \n\t"
950                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
951                   "addl      $24, %%edi       \n\t"
952                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
954                   "ja        mainloop24       \n\t"
956                 "mainloop24end:               \n\t"
957 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
958                   "movl      %%eax, %%ecx     \n\t"
959                   "cmpl      $0, %%ecx        \n\t"
960                   "jz        end24            \n\t"
961 // preload        "movl      mask, %%edx      \n\t"
962                   "sall      $24, %%edx       \n\t" // make low byte, high byte
964                 "secondloop24:                \n\t"
965                   "sall      %%edx            \n\t" // move high bit to CF
966                   "jnc       skip24           \n\t" // if CF = 0
967                   "movw      (%%esi), %%ax    \n\t"
968                   "movw      %%ax, (%%edi)    \n\t"
969                   "xorl      %%eax, %%eax     \n\t"
970                   "movb      2(%%esi), %%al   \n\t"
971                   "movb      %%al, 2(%%edi)   \n\t"
973                 "skip24:                      \n\t"
974                   "addl      $3, %%esi        \n\t"
975                   "addl      $3, %%edi        \n\t"
976                   "decl      %%ecx            \n\t"
977                   "jnz       secondloop24     \n\t"
979                 "end24:                       \n\t"
980                   "EMMS                       \n\t" // DONE
982                   : "=a" (dummy_value_a),           // output regs (dummy)
983                     "=d" (dummy_value_d),
984                     "=c" (dummy_value_c),
985                     "=S" (dummy_value_S),
986                     "=D" (dummy_value_D)
988                   : "3" (srcptr),      // esi       // input regs
989                     "4" (dstptr),      // edi
990                     "0" (diff),        // eax
991 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
992                     "2" (len),         // ecx
993                     "1" (mask)         // edx
995 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
996                   : "%mm0", "%mm1", "%mm2"          // clobber list
997                   , "%mm4", "%mm5", "%mm6", "%mm7"
998 #endif
999                );
1000             }
1001             else /* mmx _not supported - Use modified C routine */
1002 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1003             {
1004                register png_uint_32 i;
1005                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1006                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1007                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1008                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1009                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1010                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1011                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1012                int diff = (int) (png_ptr->width & 7); /* amount lost */
1013                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1015                srcptr = png_ptr->row_buf + 1 + initial_val;
1016                dstptr = row + initial_val;
1018                for (i = initial_val; i < final_val; i += stride)
1019                {
1020                   png_memcpy(dstptr, srcptr, rep_bytes);
1021                   srcptr += stride;
1022                   dstptr += stride;
1023                }
1024                if (diff)  /* number of leftover pixels:  3 for pngtest */
1025                {
1026                   final_val+=diff*BPP3;
1027                   for (; i < final_val; i += stride)
1028                   {
1029                      if (rep_bytes > (int)(final_val-i))
1030                         rep_bytes = (int)(final_val-i);
1031                      png_memcpy(dstptr, srcptr, rep_bytes);
1032                      srcptr += stride;
1033                      dstptr += stride;
1034                   }
1035                }
1036             } /* end of else (_mmx_supported) */
1038             break;
1039          }       /* end 24 bpp */
1041          case 32:       /* png_ptr->row_info.pixel_depth */
1042          {
1043             png_bytep srcptr;
1044             png_bytep dstptr;
1046 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1047             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1048                 /* && _mmx_supported */ )
1049             {
1050                png_uint_32 len;
1051                int diff;
1052                int dummy_value_a;   // fix 'forbidden register spilled' error
1053                int dummy_value_d;
1054                int dummy_value_c;
1055                int dummy_value_S;
1056                int dummy_value_D;
1057                _unmask = ~mask;            // global variable for -fPIC version
1058                srcptr = png_ptr->row_buf + 1;
1059                dstptr = row;
1060                len  = png_ptr->width &~7;  // reduce to multiple of 8
1061                diff = (int) (png_ptr->width & 7); // amount lost //
1063                __asm__ __volatile__ (
1064                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1065                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1066                   "punpcklbw %%mm7, %%mm7     \n\t"
1067                   "punpcklwd %%mm7, %%mm7     \n\t"
1068                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1070                   "movq      _mask32_0, %%mm0 \n\t"
1071                   "movq      _mask32_1, %%mm1 \n\t"
1072                   "movq      _mask32_2, %%mm2 \n\t"
1073                   "movq      _mask32_3, %%mm3 \n\t"
1075                   "pand      %%mm7, %%mm0     \n\t"
1076                   "pand      %%mm7, %%mm1     \n\t"
1077                   "pand      %%mm7, %%mm2     \n\t"
1078                   "pand      %%mm7, %%mm3     \n\t"
1080                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1081                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1082                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1083                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1085 // preload        "movl      len, %%ecx       \n\t" // load length of line
1086 // preload        "movl      srcptr, %%esi    \n\t" // load source
1087 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1089                   "cmpl      $0, %%ecx        \n\t" // lcr
1090                   "jz        mainloop32end    \n\t"
1092                 "mainloop32:                  \n\t"
1093                   "movq      (%%esi), %%mm4   \n\t"
1094                   "pand      %%mm0, %%mm4     \n\t"
1095                   "movq      %%mm0, %%mm6     \n\t"
1096                   "movq      (%%edi), %%mm7   \n\t"
1097                   "pandn     %%mm7, %%mm6     \n\t"
1098                   "por       %%mm6, %%mm4     \n\t"
1099                   "movq      %%mm4, (%%edi)   \n\t"
1101                   "movq      8(%%esi), %%mm5  \n\t"
1102                   "pand      %%mm1, %%mm5     \n\t"
1103                   "movq      %%mm1, %%mm7     \n\t"
1104                   "movq      8(%%edi), %%mm6  \n\t"
1105                   "pandn     %%mm6, %%mm7     \n\t"
1106                   "por       %%mm7, %%mm5     \n\t"
1107                   "movq      %%mm5, 8(%%edi)  \n\t"
1109                   "movq      16(%%esi), %%mm6 \n\t"
1110                   "pand      %%mm2, %%mm6     \n\t"
1111                   "movq      %%mm2, %%mm4     \n\t"
1112                   "movq      16(%%edi), %%mm7 \n\t"
1113                   "pandn     %%mm7, %%mm4     \n\t"
1114                   "por       %%mm4, %%mm6     \n\t"
1115                   "movq      %%mm6, 16(%%edi) \n\t"
1117                   "movq      24(%%esi), %%mm7 \n\t"
1118                   "pand      %%mm3, %%mm7     \n\t"
1119                   "movq      %%mm3, %%mm5     \n\t"
1120                   "movq      24(%%edi), %%mm4 \n\t"
1121                   "pandn     %%mm4, %%mm5     \n\t"
1122                   "por       %%mm5, %%mm7     \n\t"
1123                   "movq      %%mm7, 24(%%edi) \n\t"
1125                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1126                   "addl      $32, %%edi       \n\t"
1127                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1128                   "ja        mainloop32       \n\t"
1130                 "mainloop32end:               \n\t"
1131 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1132                   "movl      %%eax, %%ecx     \n\t"
1133                   "cmpl      $0, %%ecx        \n\t"
1134                   "jz        end32            \n\t"
1135 // preload        "movl      mask, %%edx      \n\t"
1136                   "sall      $24, %%edx       \n\t" // low byte => high byte
1138                 "secondloop32:                \n\t"
1139                   "sall      %%edx            \n\t" // move high bit to CF
1140                   "jnc       skip32           \n\t" // if CF = 0
1141                   "movl      (%%esi), %%eax   \n\t"
1142                   "movl      %%eax, (%%edi)   \n\t"
1144                 "skip32:                      \n\t"
1145                   "addl      $4, %%esi        \n\t"
1146                   "addl      $4, %%edi        \n\t"
1147                   "decl      %%ecx            \n\t"
1148                   "jnz       secondloop32     \n\t"
1150                 "end32:                       \n\t"
1151                   "EMMS                       \n\t" // DONE
1153                   : "=a" (dummy_value_a),           // output regs (dummy)
1154                     "=d" (dummy_value_d),
1155                     "=c" (dummy_value_c),
1156                     "=S" (dummy_value_S),
1157                     "=D" (dummy_value_D)
1159                   : "3" (srcptr),      // esi       // input regs
1160                     "4" (dstptr),      // edi
1161                     "0" (diff),        // eax
1162 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1163                     "2" (len),         // ecx
1164                     "1" (mask)         // edx
1166 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1167                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1168                   , "%mm4", "%mm5", "%mm6", "%mm7"
1169 #endif
1170                );
1171             }
1172             else /* mmx _not supported - Use modified C routine */
1173 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1174             {
1175                register png_uint_32 i;
1176                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1177                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1178                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1179                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1180                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1181                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1182                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1183                int diff = (int) (png_ptr->width & 7); /* amount lost */
1184                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1186                srcptr = png_ptr->row_buf + 1 + initial_val;
1187                dstptr = row + initial_val;
1189                for (i = initial_val; i < final_val; i += stride)
1190                {
1191                   png_memcpy(dstptr, srcptr, rep_bytes);
1192                   srcptr += stride;
1193                   dstptr += stride;
1194                }
1195                if (diff)  /* number of leftover pixels:  3 for pngtest */
1196                {
1197                   final_val+=diff*BPP4;
1198                   for (; i < final_val; i += stride)
1199                   {
1200                      if (rep_bytes > (int)(final_val-i))
1201                         rep_bytes = (int)(final_val-i);
1202                      png_memcpy(dstptr, srcptr, rep_bytes);
1203                      srcptr += stride;
1204                      dstptr += stride;
1205                   }
1206                }
1207             } /* end of else (_mmx_supported) */
1209             break;
1210          }       /* end 32 bpp */
1212          case 48:       /* png_ptr->row_info.pixel_depth */
1213          {
1214             png_bytep srcptr;
1215             png_bytep dstptr;
1217 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1218             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1219                 /* && _mmx_supported */ )
1220             {
1221                png_uint_32 len;
1222                int diff;
1223                int dummy_value_a;   // fix 'forbidden register spilled' error
1224                int dummy_value_d;
1225                int dummy_value_c;
1226                int dummy_value_S;
1227                int dummy_value_D;
1228                _unmask = ~mask;            // global variable for -fPIC version
1229                srcptr = png_ptr->row_buf + 1;
1230                dstptr = row;
1231                len  = png_ptr->width &~7;  // reduce to multiple of 8
1232                diff = (int) (png_ptr->width & 7); // amount lost //
1234                __asm__ __volatile__ (
1235                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1236                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1237                   "punpcklbw %%mm7, %%mm7     \n\t"
1238                   "punpcklwd %%mm7, %%mm7     \n\t"
1239                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1241                   "movq      _mask48_0, %%mm0 \n\t"
1242                   "movq      _mask48_1, %%mm1 \n\t"
1243                   "movq      _mask48_2, %%mm2 \n\t"
1244                   "movq      _mask48_3, %%mm3 \n\t"
1245                   "movq      _mask48_4, %%mm4 \n\t"
1246                   "movq      _mask48_5, %%mm5 \n\t"
1248                   "pand      %%mm7, %%mm0     \n\t"
1249                   "pand      %%mm7, %%mm1     \n\t"
1250                   "pand      %%mm7, %%mm2     \n\t"
1251                   "pand      %%mm7, %%mm3     \n\t"
1252                   "pand      %%mm7, %%mm4     \n\t"
1253                   "pand      %%mm7, %%mm5     \n\t"
1255                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1256                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1257                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1258                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1259                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1260                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1262 // preload        "movl      len, %%ecx       \n\t" // load length of line
1263 // preload        "movl      srcptr, %%esi    \n\t" // load source
1264 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1266                   "cmpl      $0, %%ecx        \n\t"
1267                   "jz        mainloop48end    \n\t"
1269                 "mainloop48:                  \n\t"
1270                   "movq      (%%esi), %%mm7   \n\t"
1271                   "pand      %%mm0, %%mm7     \n\t"
1272                   "movq      %%mm0, %%mm6     \n\t"
1273                   "pandn     (%%edi), %%mm6   \n\t"
1274                   "por       %%mm6, %%mm7     \n\t"
1275                   "movq      %%mm7, (%%edi)   \n\t"
1277                   "movq      8(%%esi), %%mm6  \n\t"
1278                   "pand      %%mm1, %%mm6     \n\t"
1279                   "movq      %%mm1, %%mm7     \n\t"
1280                   "pandn     8(%%edi), %%mm7  \n\t"
1281                   "por       %%mm7, %%mm6     \n\t"
1282                   "movq      %%mm6, 8(%%edi)  \n\t"
1284                   "movq      16(%%esi), %%mm6 \n\t"
1285                   "pand      %%mm2, %%mm6     \n\t"
1286                   "movq      %%mm2, %%mm7     \n\t"
1287                   "pandn     16(%%edi), %%mm7 \n\t"
1288                   "por       %%mm7, %%mm6     \n\t"
1289                   "movq      %%mm6, 16(%%edi) \n\t"
1291                   "movq      24(%%esi), %%mm7 \n\t"
1292                   "pand      %%mm3, %%mm7     \n\t"
1293                   "movq      %%mm3, %%mm6     \n\t"
1294                   "pandn     24(%%edi), %%mm6 \n\t"
1295                   "por       %%mm6, %%mm7     \n\t"
1296                   "movq      %%mm7, 24(%%edi) \n\t"
1298                   "movq      32(%%esi), %%mm6 \n\t"
1299                   "pand      %%mm4, %%mm6     \n\t"
1300                   "movq      %%mm4, %%mm7     \n\t"
1301                   "pandn     32(%%edi), %%mm7 \n\t"
1302                   "por       %%mm7, %%mm6     \n\t"
1303                   "movq      %%mm6, 32(%%edi) \n\t"
1305                   "movq      40(%%esi), %%mm7 \n\t"
1306                   "pand      %%mm5, %%mm7     \n\t"
1307                   "movq      %%mm5, %%mm6     \n\t"
1308                   "pandn     40(%%edi), %%mm6 \n\t"
1309                   "por       %%mm6, %%mm7     \n\t"
1310                   "movq      %%mm7, 40(%%edi) \n\t"
1312                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1313                   "addl      $48, %%edi       \n\t"
1314                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1316                   "ja        mainloop48       \n\t"
1318                 "mainloop48end:               \n\t"
1319 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1320                   "movl      %%eax, %%ecx     \n\t"
1321                   "cmpl      $0, %%ecx        \n\t"
1322                   "jz        end48            \n\t"
1323 // preload        "movl      mask, %%edx      \n\t"
1324                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1326                 "secondloop48:                \n\t"
1327                   "sall      %%edx            \n\t" // move high bit to CF
1328                   "jnc       skip48           \n\t" // if CF = 0
1329                   "movl      (%%esi), %%eax   \n\t"
1330                   "movl      %%eax, (%%edi)   \n\t"
1332                 "skip48:                      \n\t"
1333                   "addl      $4, %%esi        \n\t"
1334                   "addl      $4, %%edi        \n\t"
1335                   "decl      %%ecx            \n\t"
1336                   "jnz       secondloop48     \n\t"
1338                 "end48:                       \n\t"
1339                   "EMMS                       \n\t" // DONE
1341                   : "=a" (dummy_value_a),           // output regs (dummy)
1342                     "=d" (dummy_value_d),
1343                     "=c" (dummy_value_c),
1344                     "=S" (dummy_value_S),
1345                     "=D" (dummy_value_D)
1347                   : "3" (srcptr),      // esi       // input regs
1348                     "4" (dstptr),      // edi
1349                     "0" (diff),        // eax
1350 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1351                     "2" (len),         // ecx
1352                     "1" (mask)         // edx
1354 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1355                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1356                   , "%mm4", "%mm5", "%mm6", "%mm7"
1357 #endif
1358                );
1359             }
1360             else /* mmx _not supported - Use modified C routine */
1361 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1362             {
1363                register png_uint_32 i;
1364                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1365                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1366                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1367                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1368                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1369                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1370                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1371                int diff = (int) (png_ptr->width & 7); /* amount lost */
1372                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1374                srcptr = png_ptr->row_buf + 1 + initial_val;
1375                dstptr = row + initial_val;
1377                for (i = initial_val; i < final_val; i += stride)
1378                {
1379                   png_memcpy(dstptr, srcptr, rep_bytes);
1380                   srcptr += stride;
1381                   dstptr += stride;
1382                }
1383                if (diff)  /* number of leftover pixels:  3 for pngtest */
1384                {
1385                   final_val+=diff*BPP6;
1386                   for (; i < final_val; i += stride)
1387                   {
1388                      if (rep_bytes > (int)(final_val-i))
1389                         rep_bytes = (int)(final_val-i);
1390                      png_memcpy(dstptr, srcptr, rep_bytes);
1391                      srcptr += stride;
1392                      dstptr += stride;
1393                   }
1394                }
1395             } /* end of else (_mmx_supported) */
1397             break;
1398          }       /* end 48 bpp */
1400          case 64:       /* png_ptr->row_info.pixel_depth */
1401          {
1402             png_bytep srcptr;
1403             png_bytep dstptr;
1404             register png_uint_32 i;
1405             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1406               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1407             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1408               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1409             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1410               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1411             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1412             int diff = (int) (png_ptr->width & 7); /* amount lost */
1413             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1415             srcptr = png_ptr->row_buf + 1 + initial_val;
1416             dstptr = row + initial_val;
1418             for (i = initial_val; i < final_val; i += stride)
1419             {
1420                png_memcpy(dstptr, srcptr, rep_bytes);
1421                srcptr += stride;
1422                dstptr += stride;
1423             }
1424             if (diff)  /* number of leftover pixels:  3 for pngtest */
1425             {
1426                final_val+=diff*BPP8;
1427                for (; i < final_val; i += stride)
1428                {
1429                   if (rep_bytes > (int)(final_val-i))
1430                      rep_bytes = (int)(final_val-i);
1431                   png_memcpy(dstptr, srcptr, rep_bytes);
1432                   srcptr += stride;
1433                   dstptr += stride;
1434                }
1435             }
1437             break;
1438          }       /* end 64 bpp */
1440          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1441          {
1442             /* this should never happen */
1443             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1444             break;
1445          }
1446       } /* end switch (png_ptr->row_info.pixel_depth) */
1448    } /* end if (non-trivial mask) */
1450 } /* end png_combine_row() */
1452 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1457 /*===========================================================================*/
1458 /*                                                                           */
1459 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1460 /*                                                                           */
1461 /*===========================================================================*/
1463 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1464 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1466 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1467  * has taken place.  [GRR: what other steps come before and/or after?]
1468  */
1470 void /* PRIVATE */
1471 png_do_read_interlace(png_structp png_ptr)
1473    png_row_infop row_info = &(png_ptr->row_info);
1474    png_bytep row = png_ptr->row_buf + 1;
1475    int pass = png_ptr->pass;
1476 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1477    png_uint_32 transformations = png_ptr->transformations;
1478 #endif
1480    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1482 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1483    if (_mmx_supported == 2) {
1484        /* this should have happened in png_init_mmx_flags() already */
1485        png_warning(png_ptr, "asm_flags may not have been initialized");
1486        png_mmx_support();
1487    }
1488 #endif
1490    if (row != NULL && row_info != NULL)
1491    {
1492       png_uint_32 final_width;
1494       final_width = row_info->width * png_pass_inc[pass];
1496       switch (row_info->pixel_depth)
1497       {
1498          case 1:
1499          {
1500             png_bytep sp, dp;
1501             int sshift, dshift;
1502             int s_start, s_end, s_inc;
1503             png_byte v;
1504             png_uint_32 i;
1505             int j;
1507             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1508             dp = row + (png_size_t)((final_width - 1) >> 3);
1509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1510             if (transformations & PNG_PACKSWAP)
1511             {
1512                sshift = (int)((row_info->width + 7) & 7);
1513                dshift = (int)((final_width + 7) & 7);
1514                s_start = 7;
1515                s_end = 0;
1516                s_inc = -1;
1517             }
1518             else
1519 #endif
1520             {
1521                sshift = 7 - (int)((row_info->width + 7) & 7);
1522                dshift = 7 - (int)((final_width + 7) & 7);
1523                s_start = 0;
1524                s_end = 7;
1525                s_inc = 1;
1526             }
1528             for (i = row_info->width; i; i--)
1529             {
1530                v = (png_byte)((*sp >> sshift) & 0x1);
1531                for (j = 0; j < png_pass_inc[pass]; j++)
1532                {
1533                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1534                   *dp |= (png_byte)(v << dshift);
1535                   if (dshift == s_end)
1536                   {
1537                      dshift = s_start;
1538                      dp--;
1539                   }
1540                   else
1541                      dshift += s_inc;
1542                }
1543                if (sshift == s_end)
1544                {
1545                   sshift = s_start;
1546                   sp--;
1547                }
1548                else
1549                   sshift += s_inc;
1550             }
1551             break;
1552          }
1554          case 2:
1555          {
1556             png_bytep sp, dp;
1557             int sshift, dshift;
1558             int s_start, s_end, s_inc;
1559             png_uint_32 i;
1561             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1562             dp = row + (png_size_t)((final_width - 1) >> 2);
1563 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1564             if (transformations & PNG_PACKSWAP)
1565             {
1566                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1567                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1568                s_start = 6;
1569                s_end = 0;
1570                s_inc = -2;
1571             }
1572             else
1573 #endif
1574             {
1575                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1576                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1577                s_start = 0;
1578                s_end = 6;
1579                s_inc = 2;
1580             }
1582             for (i = row_info->width; i; i--)
1583             {
1584                png_byte v;
1585                int j;
1587                v = (png_byte)((*sp >> sshift) & 0x3);
1588                for (j = 0; j < png_pass_inc[pass]; j++)
1589                {
1590                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1591                   *dp |= (png_byte)(v << dshift);
1592                   if (dshift == s_end)
1593                   {
1594                      dshift = s_start;
1595                      dp--;
1596                   }
1597                   else
1598                      dshift += s_inc;
1599                }
1600                if (sshift == s_end)
1601                {
1602                   sshift = s_start;
1603                   sp--;
1604                }
1605                else
1606                   sshift += s_inc;
1607             }
1608             break;
1609          }
1611          case 4:
1612          {
1613             png_bytep sp, dp;
1614             int sshift, dshift;
1615             int s_start, s_end, s_inc;
1616             png_uint_32 i;
1618             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1619             dp = row + (png_size_t)((final_width - 1) >> 1);
1620 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1621             if (transformations & PNG_PACKSWAP)
1622             {
1623                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1624                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1625                s_start = 4;
1626                s_end = 0;
1627                s_inc = -4;
1628             }
1629             else
1630 #endif
1631             {
1632                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1633                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1634                s_start = 0;
1635                s_end = 4;
1636                s_inc = 4;
1637             }
1639             for (i = row_info->width; i; i--)
1640             {
1641                png_byte v;
1642                int j;
1644                v = (png_byte)((*sp >> sshift) & 0xf);
1645                for (j = 0; j < png_pass_inc[pass]; j++)
1646                {
1647                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1648                   *dp |= (png_byte)(v << dshift);
1649                   if (dshift == s_end)
1650                   {
1651                      dshift = s_start;
1652                      dp--;
1653                   }
1654                   else
1655                      dshift += s_inc;
1656                }
1657                if (sshift == s_end)
1658                {
1659                   sshift = s_start;
1660                   sp--;
1661                }
1662                else
1663                   sshift += s_inc;
1664             }
1665             break;
1666          }
1668        /*====================================================================*/
1670          default: /* 8-bit or larger (this is where the routine is modified) */
1671          {
1672 #if 0
1673 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1674 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1675 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1676 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1677 #endif
1678             png_bytep sptr, dp;
1679             png_uint_32 i;
1680             png_size_t pixel_bytes;
1681             int width = (int)row_info->width;
1683             pixel_bytes = (row_info->pixel_depth >> 3);
1685             /* point sptr at the last pixel in the pre-expanded row: */
1686             sptr = row + (width - 1) * pixel_bytes;
1688             /* point dp at the last pixel position in the expanded row: */
1689             dp = row + (final_width - 1) * pixel_bytes;
1691             /* New code by Nirav Chhatrapati - Intel Corporation */
1693 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1694             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1695                 /* && _mmx_supported */ )
1696             {
1697                //--------------------------------------------------------------
1698                if (pixel_bytes == 3)
1699                {
1700                   if (((pass == 0) || (pass == 1)) && width)
1701                   {
1702                      int dummy_value_c;   // fix 'forbidden register spilled'
1703                      int dummy_value_S;
1704                      int dummy_value_D;
1706                      __asm__ __volatile__ (
1707                         "subl $21, %%edi         \n\t"
1708                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1710                      ".loop3_pass0:              \n\t"
1711                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1712                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1713                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1714                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1715                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1716                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1717                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1718                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1719                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1720                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1721                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1722                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1723                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1724                         "movq %%mm4, 16(%%edi)   \n\t"
1725                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1726                         "movq %%mm3, 8(%%edi)    \n\t"
1727                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1728                         "subl $3, %%esi          \n\t"
1729                         "movq %%mm0, (%%edi)     \n\t"
1730                         "subl $24, %%edi         \n\t"
1731                         "decl %%ecx              \n\t"
1732                         "jnz .loop3_pass0        \n\t"
1733                         "EMMS                    \n\t" // DONE
1735                         : "=c" (dummy_value_c),        // output regs (dummy)
1736                           "=S" (dummy_value_S),
1737                           "=D" (dummy_value_D)
1739                         : "1" (sptr),      // esi      // input regs
1740                           "2" (dp),        // edi
1741                           "0" (width)      // ecx
1742 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
1744 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1745                         : "%mm0", "%mm1", "%mm2"       // clobber list
1746                         , "%mm3", "%mm4"
1747 #endif
1748                      );
1749                   }
1750                   else if (((pass == 2) || (pass == 3)) && width)
1751                   {
1752                      int dummy_value_c;   // fix 'forbidden register spilled'
1753                      int dummy_value_S;
1754                      int dummy_value_D;
1756                      __asm__ __volatile__ (
1757                         "subl $9, %%edi          \n\t"
1758                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1760                      ".loop3_pass2:              \n\t"
1761                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1762                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1763                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1764                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1765                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1766                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1767                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1768                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1769                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1770                         "movq %%mm0, 4(%%edi)    \n\t"
1771                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1772                         "subl $3, %%esi          \n\t"
1773                         "movd %%mm0, (%%edi)     \n\t"
1774                         "subl $12, %%edi         \n\t"
1775                         "decl %%ecx              \n\t"
1776                         "jnz .loop3_pass2        \n\t"
1777                         "EMMS                    \n\t" // DONE
1779                         : "=c" (dummy_value_c),        // output regs (dummy)
1780                           "=S" (dummy_value_S),
1781                           "=D" (dummy_value_D)
1783                         : "1" (sptr),      // esi      // input regs
1784                           "2" (dp),        // edi
1785                           "0" (width)      // ecx
1787 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1788                         : "%mm0", "%mm1", "%mm2"       // clobber list
1789 #endif
1790                      );
1791                   }
1792                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1793                   {
1794                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1795                      if (width_mmx < 0)
1796                          width_mmx = 0;
1797                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1798                      if (width_mmx)
1799                      {
1800                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1801                         // sptr points at last pixel in pre-expanded row
1802                         // dp points at last pixel position in expanded row
1803                         int dummy_value_c;  // fix 'forbidden register spilled'
1804                         int dummy_value_S;
1805                         int dummy_value_D;
1807                         __asm__ __volatile__ (
1808                            "subl $3, %%esi          \n\t"
1809                            "subl $9, %%edi          \n\t"
1810                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1812                         ".loop3_pass4:              \n\t"
1813                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1814                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1815                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1816                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1817                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
1818                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1819                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1820                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1821                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1822                            "movq %%mm0, (%%edi)     \n\t"
1823                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1824                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
1825                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1826                            "subl $6, %%esi          \n\t"
1827                            "movd %%mm2, 8(%%edi)    \n\t"
1828                            "subl $12, %%edi         \n\t"
1829                            "subl $2, %%ecx          \n\t"
1830                            "jnz .loop3_pass4        \n\t"
1831                            "EMMS                    \n\t" // DONE
1833                            : "=c" (dummy_value_c),        // output regs (dummy)
1834                              "=S" (dummy_value_S),
1835                              "=D" (dummy_value_D)
1837                            : "1" (sptr),      // esi      // input regs
1838                              "2" (dp),        // edi
1839                              "0" (width_mmx)  // ecx
1841 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1842                            : "%mm0", "%mm1"               // clobber list
1843                            , "%mm2", "%mm3"
1844 #endif
1845                         );
1846                      }
1848                      sptr -= width_mmx*3;
1849                      dp -= width_mmx*6;
1850                      for (i = width; i; i--)
1851                      {
1852                         png_byte v[8];
1853                         int j;
1855                         png_memcpy(v, sptr, 3);
1856                         for (j = 0; j < png_pass_inc[pass]; j++)
1857                         {
1858                            png_memcpy(dp, v, 3);
1859                            dp -= 3;
1860                         }
1861                         sptr -= 3;
1862                      }
1863                   }
1864                } /* end of pixel_bytes == 3 */
1866                //--------------------------------------------------------------
1867                else if (pixel_bytes == 1)
1868                {
1869                   if (((pass == 0) || (pass == 1)) && width)
1870                   {
1871                      int width_mmx = ((width >> 2) << 2);
1872                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1873                      if (width_mmx)
1874                      {
1875                         int dummy_value_c;  // fix 'forbidden register spilled'
1876                         int dummy_value_S;
1877                         int dummy_value_D;
1879                         __asm__ __volatile__ (
1880                            "subl $3, %%esi          \n\t"
1881                            "subl $31, %%edi         \n\t"
1883                         ".loop1_pass0:              \n\t"
1884                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1885                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1886                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1887                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1888                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1889                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1890                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1891                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1892                            "movq %%mm0, (%%edi)     \n\t"
1893                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1894                            "movq %%mm3, 8(%%edi)    \n\t"
1895                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1896                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1897                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1898                            "movq %%mm2, 16(%%edi)   \n\t"
1899                            "subl $4, %%esi          \n\t"
1900                            "movq %%mm4, 24(%%edi)   \n\t"
1901                            "subl $32, %%edi         \n\t"
1902                            "subl $4, %%ecx          \n\t"
1903                            "jnz .loop1_pass0        \n\t"
1904                            "EMMS                    \n\t" // DONE
1906                            : "=c" (dummy_value_c),        // output regs (dummy)
1907                              "=S" (dummy_value_S),
1908                              "=D" (dummy_value_D)
1910                            : "1" (sptr),      // esi      // input regs
1911                              "2" (dp),        // edi
1912                              "0" (width_mmx)  // ecx
1914 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1915                            : "%mm0", "%mm1", "%mm2"       // clobber list
1916                            , "%mm3", "%mm4"
1917 #endif
1918                         );
1919                      }
1921                      sptr -= width_mmx;
1922                      dp -= width_mmx*8;
1923                      for (i = width; i; i--)
1924                      {
1925                         int j;
1927                        /* I simplified this part in version 1.0.4e
1928                         * here and in several other instances where
1929                         * pixel_bytes == 1  -- GR-P
1930                         *
1931                         * Original code:
1932                         *
1933                         * png_byte v[8];
1934                         * png_memcpy(v, sptr, pixel_bytes);
1935                         * for (j = 0; j < png_pass_inc[pass]; j++)
1936                         * {
1937                         *    png_memcpy(dp, v, pixel_bytes);
1938                         *    dp -= pixel_bytes;
1939                         * }
1940                         * sptr -= pixel_bytes;
1941                         *
1942                         * Replacement code is in the next three lines:
1943                         */
1945                         for (j = 0; j < png_pass_inc[pass]; j++)
1946                         {
1947                            *dp-- = *sptr;
1948                         }
1949                         --sptr;
1950                      }
1951                   }
1952                   else if (((pass == 2) || (pass == 3)) && width)
1953                   {
1954                      int width_mmx = ((width >> 2) << 2);
1955                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1956                      if (width_mmx)
1957                      {
1958                         int dummy_value_c;  // fix 'forbidden register spilled'
1959                         int dummy_value_S;
1960                         int dummy_value_D;
1962                         __asm__ __volatile__ (
1963                            "subl $3, %%esi          \n\t"
1964                            "subl $15, %%edi         \n\t"
1966                         ".loop1_pass2:              \n\t"
1967                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1968                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1969                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
1970                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1971                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
1972                            "movq %%mm0, (%%edi)     \n\t"
1973                            "subl $4, %%esi          \n\t"
1974                            "movq %%mm1, 8(%%edi)    \n\t"
1975                            "subl $16, %%edi         \n\t"
1976                            "subl $4, %%ecx          \n\t"
1977                            "jnz .loop1_pass2        \n\t"
1978                            "EMMS                    \n\t" // DONE
1980                            : "=c" (dummy_value_c),        // output regs (dummy)
1981                              "=S" (dummy_value_S),
1982                              "=D" (dummy_value_D)
1984                            : "1" (sptr),      // esi      // input regs
1985                              "2" (dp),        // edi
1986                              "0" (width_mmx)  // ecx
1988 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1989                            : "%mm0", "%mm1"               // clobber list
1990 #endif
1991                         );
1992                      }
1994                      sptr -= width_mmx;
1995                      dp -= width_mmx*4;
1996                      for (i = width; i; i--)
1997                      {
1998                         int j;
2000                         for (j = 0; j < png_pass_inc[pass]; j++)
2001                         {
2002                            *dp-- = *sptr;
2003                         }
2004                         --sptr;
2005                      }
2006                   }
2007                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
2008                   {
2009                      int width_mmx = ((width >> 3) << 3);
2010                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2011                      if (width_mmx)
2012                      {
2013                         int dummy_value_c;  // fix 'forbidden register spilled'
2014                         int dummy_value_S;
2015                         int dummy_value_D;
2017                         __asm__ __volatile__ (
2018                            "subl $7, %%esi          \n\t"
2019                            "subl $15, %%edi         \n\t"
2021                         ".loop1_pass4:              \n\t"
2022                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2023                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2024                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2025                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2026                            "movq %%mm1, 8(%%edi)    \n\t"
2027                            "subl $8, %%esi          \n\t"
2028                            "movq %%mm0, (%%edi)     \n\t"
2029                            "subl $16, %%edi         \n\t"
2030                            "subl $8, %%ecx          \n\t"
2031                            "jnz .loop1_pass4        \n\t"
2032                            "EMMS                    \n\t" // DONE
2034                            : "=c" (dummy_value_c),        // output regs (none)
2035                              "=S" (dummy_value_S),
2036                              "=D" (dummy_value_D)
2038                            : "1" (sptr),      // esi      // input regs
2039                              "2" (dp),        // edi
2040                              "0" (width_mmx)  // ecx
2042 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2043                            : "%mm0", "%mm1"               // clobber list
2044 #endif
2045                         );
2046                      }
2048                      sptr -= width_mmx;
2049                      dp -= width_mmx*2;
2050                      for (i = width; i; i--)
2051                      {
2052                         int j;
2054                         for (j = 0; j < png_pass_inc[pass]; j++)
2055                         {
2056                            *dp-- = *sptr;
2057                         }
2058                         --sptr;
2059                      }
2060                   }
2061                } /* end of pixel_bytes == 1 */
2063                //--------------------------------------------------------------
2064                else if (pixel_bytes == 2)
2065                {
2066                   if (((pass == 0) || (pass == 1)) && width)
2067                   {
2068                      int width_mmx = ((width >> 1) << 1);
2069                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2070                      if (width_mmx)
2071                      {
2072                         int dummy_value_c;  // fix 'forbidden register spilled'
2073                         int dummy_value_S;
2074                         int dummy_value_D;
2076                         __asm__ __volatile__ (
2077                            "subl $2, %%esi          \n\t"
2078                            "subl $30, %%edi         \n\t"
2080                         ".loop2_pass0:              \n\t"
2081                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2082                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2083                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2084                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2085                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2086                            "movq %%mm0, (%%edi)     \n\t"
2087                            "movq %%mm0, 8(%%edi)    \n\t"
2088                            "movq %%mm1, 16(%%edi)   \n\t"
2089                            "subl $4, %%esi          \n\t"
2090                            "movq %%mm1, 24(%%edi)   \n\t"
2091                            "subl $32, %%edi         \n\t"
2092                            "subl $2, %%ecx          \n\t"
2093                            "jnz .loop2_pass0        \n\t"
2094                            "EMMS                    \n\t" // DONE
2096                            : "=c" (dummy_value_c),        // output regs (dummy)
2097                              "=S" (dummy_value_S),
2098                              "=D" (dummy_value_D)
2100                            : "1" (sptr),      // esi      // input regs
2101                              "2" (dp),        // edi
2102                              "0" (width_mmx)  // ecx
2104 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2105                            : "%mm0", "%mm1"               // clobber list
2106 #endif
2107                         );
2108                      }
2110                      sptr -= (width_mmx*2 - 2); // sign fixed
2111                      dp -= (width_mmx*16 - 2);  // sign fixed
2112                      for (i = width; i; i--)
2113                      {
2114                         png_byte v[8];
2115                         int j;
2116                         sptr -= 2;
2117                         png_memcpy(v, sptr, 2);
2118                         for (j = 0; j < png_pass_inc[pass]; j++)
2119                         {
2120                            dp -= 2;
2121                            png_memcpy(dp, v, 2);
2122                         }
2123                      }
2124                   }
2125                   else if (((pass == 2) || (pass == 3)) && width)
2126                   {
2127                      int width_mmx = ((width >> 1) << 1) ;
2128                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2129                      if (width_mmx)
2130                      {
2131                         int dummy_value_c;  // fix 'forbidden register spilled'
2132                         int dummy_value_S;
2133                         int dummy_value_D;
2135                         __asm__ __volatile__ (
2136                            "subl $2, %%esi          \n\t"
2137                            "subl $14, %%edi         \n\t"
2139                         ".loop2_pass2:              \n\t"
2140                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2141                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2142                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2143                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2144                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2145                            "movq %%mm0, (%%edi)     \n\t"
2146                            "subl $4, %%esi          \n\t"
2147                            "movq %%mm1, 8(%%edi)    \n\t"
2148                            "subl $16, %%edi         \n\t"
2149                            "subl $2, %%ecx          \n\t"
2150                            "jnz .loop2_pass2        \n\t"
2151                            "EMMS                    \n\t" // DONE
2153                            : "=c" (dummy_value_c),        // output regs (dummy)
2154                              "=S" (dummy_value_S),
2155                              "=D" (dummy_value_D)
2157                            : "1" (sptr),      // esi      // input regs
2158                              "2" (dp),        // edi
2159                              "0" (width_mmx)  // ecx
2161 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2162                            : "%mm0", "%mm1"               // clobber list
2163 #endif
2164                         );
2165                      }
2167                      sptr -= (width_mmx*2 - 2); // sign fixed
2168                      dp -= (width_mmx*8 - 2);   // sign fixed
2169                      for (i = width; i; i--)
2170                      {
2171                         png_byte v[8];
2172                         int j;
2173                         sptr -= 2;
2174                         png_memcpy(v, sptr, 2);
2175                         for (j = 0; j < png_pass_inc[pass]; j++)
2176                         {
2177                            dp -= 2;
2178                            png_memcpy(dp, v, 2);
2179                         }
2180                      }
2181                   }
2182                   else if (width)  // pass == 4 or 5
2183                   {
2184                      int width_mmx = ((width >> 1) << 1) ;
2185                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2186                      if (width_mmx)
2187                      {
2188                         int dummy_value_c;  // fix 'forbidden register spilled'
2189                         int dummy_value_S;
2190                         int dummy_value_D;
2192                         __asm__ __volatile__ (
2193                            "subl $2, %%esi          \n\t"
2194                            "subl $6, %%edi          \n\t"
2196                         ".loop2_pass4:              \n\t"
2197                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2198                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2199                            "subl $4, %%esi          \n\t"
2200                            "movq %%mm0, (%%edi)     \n\t"
2201                            "subl $8, %%edi          \n\t"
2202                            "subl $2, %%ecx          \n\t"
2203                            "jnz .loop2_pass4        \n\t"
2204                            "EMMS                    \n\t" // DONE
2206                            : "=c" (dummy_value_c),        // output regs (dummy)
2207                              "=S" (dummy_value_S),
2208                              "=D" (dummy_value_D)
2210                            : "1" (sptr),      // esi      // input regs
2211                              "2" (dp),        // edi
2212                              "0" (width_mmx)  // ecx
2214 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2215                            : "%mm0"                       // clobber list
2216 #endif
2217                         );
2218                      }
2220                      sptr -= (width_mmx*2 - 2); // sign fixed
2221                      dp -= (width_mmx*4 - 2);   // sign fixed
2222                      for (i = width; i; i--)
2223                      {
2224                         png_byte v[8];
2225                         int j;
2226                         sptr -= 2;
2227                         png_memcpy(v, sptr, 2);
2228                         for (j = 0; j < png_pass_inc[pass]; j++)
2229                         {
2230                            dp -= 2;
2231                            png_memcpy(dp, v, 2);
2232                         }
2233                      }
2234                   }
2235                } /* end of pixel_bytes == 2 */
2237                //--------------------------------------------------------------
2238                else if (pixel_bytes == 4)
2239                {
2240                   if (((pass == 0) || (pass == 1)) && width)
2241                   {
2242                      int width_mmx = ((width >> 1) << 1);
2243                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2244                      if (width_mmx)
2245                      {
2246                         int dummy_value_c;  // fix 'forbidden register spilled'
2247                         int dummy_value_S;
2248                         int dummy_value_D;
2250                         __asm__ __volatile__ (
2251                            "subl $4, %%esi          \n\t"
2252                            "subl $60, %%edi         \n\t"
2254                         ".loop4_pass0:              \n\t"
2255                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2256                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2257                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2258                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2259                            "movq %%mm0, (%%edi)     \n\t"
2260                            "movq %%mm0, 8(%%edi)    \n\t"
2261                            "movq %%mm0, 16(%%edi)   \n\t"
2262                            "movq %%mm0, 24(%%edi)   \n\t"
2263                            "movq %%mm1, 32(%%edi)   \n\t"
2264                            "movq %%mm1, 40(%%edi)   \n\t"
2265                            "movq %%mm1, 48(%%edi)   \n\t"
2266                            "subl $8, %%esi          \n\t"
2267                            "movq %%mm1, 56(%%edi)   \n\t"
2268                            "subl $64, %%edi         \n\t"
2269                            "subl $2, %%ecx          \n\t"
2270                            "jnz .loop4_pass0        \n\t"
2271                            "EMMS                    \n\t" // DONE
2273                            : "=c" (dummy_value_c),        // output regs (dummy)
2274                              "=S" (dummy_value_S),
2275                              "=D" (dummy_value_D)
2277                            : "1" (sptr),      // esi      // input regs
2278                              "2" (dp),        // edi
2279                              "0" (width_mmx)  // ecx
2281 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2282                            : "%mm0", "%mm1"               // clobber list
2283 #endif
2284                         );
2285                      }
2287                      sptr -= (width_mmx*4 - 4); // sign fixed
2288                      dp -= (width_mmx*32 - 4);  // sign fixed
2289                      for (i = width; i; i--)
2290                      {
2291                         png_byte v[8];
2292                         int j;
2293                         sptr -= 4;
2294                         png_memcpy(v, sptr, 4);
2295                         for (j = 0; j < png_pass_inc[pass]; j++)
2296                         {
2297                            dp -= 4;
2298                            png_memcpy(dp, v, 4);
2299                         }
2300                      }
2301                   }
2302                   else if (((pass == 2) || (pass == 3)) && width)
2303                   {
2304                      int width_mmx = ((width >> 1) << 1);
2305                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2306                      if (width_mmx)
2307                      {
2308                         int dummy_value_c;  // fix 'forbidden register spilled'
2309                         int dummy_value_S;
2310                         int dummy_value_D;
2312                         __asm__ __volatile__ (
2313                            "subl $4, %%esi          \n\t"
2314                            "subl $28, %%edi         \n\t"
2316                         ".loop4_pass2:              \n\t"
2317                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2318                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2319                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2320                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2321                            "movq %%mm0, (%%edi)     \n\t"
2322                            "movq %%mm0, 8(%%edi)    \n\t"
2323                            "movq %%mm1, 16(%%edi)   \n\t"
2324                            "movq %%mm1, 24(%%edi)   \n\t"
2325                            "subl $8, %%esi          \n\t"
2326                            "subl $32, %%edi         \n\t"
2327                            "subl $2, %%ecx          \n\t"
2328                            "jnz .loop4_pass2        \n\t"
2329                            "EMMS                    \n\t" // DONE
2331                            : "=c" (dummy_value_c),        // output regs (dummy)
2332                              "=S" (dummy_value_S),
2333                              "=D" (dummy_value_D)
2335                            : "1" (sptr),      // esi      // input regs
2336                              "2" (dp),        // edi
2337                              "0" (width_mmx)  // ecx
2339 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2340                            : "%mm0", "%mm1"               // clobber list
2341 #endif
2342                         );
2343                      }
2345                      sptr -= (width_mmx*4 - 4); // sign fixed
2346                      dp -= (width_mmx*16 - 4);  // sign fixed
2347                      for (i = width; i; i--)
2348                      {
2349                         png_byte v[8];
2350                         int j;
2351                         sptr -= 4;
2352                         png_memcpy(v, sptr, 4);
2353                         for (j = 0; j < png_pass_inc[pass]; j++)
2354                         {
2355                            dp -= 4;
2356                            png_memcpy(dp, v, 4);
2357                         }
2358                      }
2359                   }
2360                   else if (width)  // pass == 4 or 5
2361                   {
2362                      int width_mmx = ((width >> 1) << 1) ;
2363                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2364                      if (width_mmx)
2365                      {
2366                         int dummy_value_c;  // fix 'forbidden register spilled'
2367                         int dummy_value_S;
2368                         int dummy_value_D;
2370                         __asm__ __volatile__ (
2371                            "subl $4, %%esi          \n\t"
2372                            "subl $12, %%edi         \n\t"
2374                         ".loop4_pass4:              \n\t"
2375                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2376                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2377                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2378                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2379                            "movq %%mm0, (%%edi)     \n\t"
2380                            "subl $8, %%esi          \n\t"
2381                            "movq %%mm1, 8(%%edi)    \n\t"
2382                            "subl $16, %%edi         \n\t"
2383                            "subl $2, %%ecx          \n\t"
2384                            "jnz .loop4_pass4        \n\t"
2385                            "EMMS                    \n\t" // DONE
2387                            : "=c" (dummy_value_c),        // output regs (dummy)
2388                              "=S" (dummy_value_S),
2389                              "=D" (dummy_value_D)
2391                            : "1" (sptr),      // esi      // input regs
2392                              "2" (dp),        // edi
2393                              "0" (width_mmx)  // ecx
2395 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2396                            : "%mm0", "%mm1"               // clobber list
2397 #endif
2398                         );
2399                      }
2401                      sptr -= (width_mmx*4 - 4); // sign fixed
2402                      dp -= (width_mmx*8 - 4);   // sign fixed
2403                      for (i = width; i; i--)
2404                      {
2405                         png_byte v[8];
2406                         int j;
2407                         sptr -= 4;
2408                         png_memcpy(v, sptr, 4);
2409                         for (j = 0; j < png_pass_inc[pass]; j++)
2410                         {
2411                            dp -= 4;
2412                            png_memcpy(dp, v, 4);
2413                         }
2414                      }
2415                   }
2416                } /* end of pixel_bytes == 4 */
2418                //--------------------------------------------------------------
2419                else if (pixel_bytes == 8)
2420                {
2421 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2422                   // GRR NOTE:  no need to combine passes here!
2423                   if (((pass == 0) || (pass == 1)) && width)
2424                   {
2425                      int dummy_value_c;  // fix 'forbidden register spilled'
2426                      int dummy_value_S;
2427                      int dummy_value_D;
2429                      // source is 8-byte RRGGBBAA
2430                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2431                      __asm__ __volatile__ (
2432                         "subl $56, %%edi         \n\t" // start of last block
2434                      ".loop8_pass0:              \n\t"
2435                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2436                         "movq %%mm0, (%%edi)     \n\t"
2437                         "movq %%mm0, 8(%%edi)    \n\t"
2438                         "movq %%mm0, 16(%%edi)   \n\t"
2439                         "movq %%mm0, 24(%%edi)   \n\t"
2440                         "movq %%mm0, 32(%%edi)   \n\t"
2441                         "movq %%mm0, 40(%%edi)   \n\t"
2442                         "movq %%mm0, 48(%%edi)   \n\t"
2443                         "subl $8, %%esi          \n\t"
2444                         "movq %%mm0, 56(%%edi)   \n\t"
2445                         "subl $64, %%edi         \n\t"
2446                         "decl %%ecx              \n\t"
2447                         "jnz .loop8_pass0        \n\t"
2448                         "EMMS                    \n\t" // DONE
2450                         : "=c" (dummy_value_c),        // output regs (dummy)
2451                           "=S" (dummy_value_S),
2452                           "=D" (dummy_value_D)
2454                         : "1" (sptr),      // esi      // input regs
2455                           "2" (dp),        // edi
2456                           "0" (width)      // ecx
2458 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2459                         : "%mm0"                       // clobber list
2460 #endif
2461                      );
2462                   }
2463                   else if (((pass == 2) || (pass == 3)) && width)
2464                   {
2465                      // source is 8-byte RRGGBBAA
2466                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2467                      int width_mmx = ((width >> 1) << 1) ;
2468                      width -= width_mmx;
2469                      if (width_mmx)
2470                      {
2471                         int dummy_value_c;  // fix 'forbidden register spilled'
2472                         int dummy_value_S;
2473                         int dummy_value_D;
2475                         __asm__ __volatile__ (
2476                            "subl $24, %%edi         \n\t" // start of last block
2478                         ".loop8_pass2:              \n\t"
2479                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2480                            "movq %%mm0, (%%edi)     \n\t"
2481                            "movq %%mm0, 8(%%edi)    \n\t"
2482                            "movq %%mm0, 16(%%edi)   \n\t"
2483                            "subl $8, %%esi          \n\t"
2484                            "movq %%mm0, 24(%%edi)   \n\t"
2485                            "subl $32, %%edi         \n\t"
2486                            "decl %%ecx              \n\t"
2487                            "jnz .loop8_pass2        \n\t"
2488                            "EMMS                    \n\t" // DONE
2490                            : "=c" (dummy_value_c),        // output regs (dummy)
2491                              "=S" (dummy_value_S),
2492                              "=D" (dummy_value_D)
2494                            : "1" (sptr),      // esi      // input regs
2495                              "2" (dp),        // edi
2496                              "0" (width)      // ecx
2498 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2499                            : "%mm0"                       // clobber list
2500 #endif
2501                         );
2502                      }
2503                   }
2504                   else if (width)  // pass == 4 or 5
2505                   {
2506                      // source is 8-byte RRGGBBAA
2507                      // dest is 16-byte RRGGBBAA RRGGBBAA
2508                      int width_mmx = ((width >> 1) << 1) ;
2509                      width -= width_mmx;
2510                      if (width_mmx)
2511                      {
2512                         int dummy_value_c;  // fix 'forbidden register spilled'
2513                         int dummy_value_S;
2514                         int dummy_value_D;
2516                         __asm__ __volatile__ (
2517                            "subl $8, %%edi          \n\t" // start of last block
2519                         ".loop8_pass4:              \n\t"
2520                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2521                            "movq %%mm0, (%%edi)     \n\t"
2522                            "subl $8, %%esi          \n\t"
2523                            "movq %%mm0, 8(%%edi)    \n\t"
2524                            "subl $16, %%edi         \n\t"
2525                            "decl %%ecx              \n\t"
2526                            "jnz .loop8_pass4        \n\t"
2527                            "EMMS                    \n\t" // DONE
2529                            : "=c" (dummy_value_c),        // output regs (dummy)
2530                              "=S" (dummy_value_S),
2531                              "=D" (dummy_value_D)
2533                            : "1" (sptr),      // esi      // input regs
2534                              "2" (dp),        // edi
2535                              "0" (width)      // ecx
2537 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2538                            : "%mm0"                       // clobber list
2539 #endif
2540                         );
2541                      }
2542                   }
2544                } /* end of pixel_bytes == 8 */
2546                //--------------------------------------------------------------
2547                else if (pixel_bytes == 6)
2548                {
2549                   for (i = width; i; i--)
2550                   {
2551                      png_byte v[8];
2552                      int j;
2553                      png_memcpy(v, sptr, 6);
2554                      for (j = 0; j < png_pass_inc[pass]; j++)
2555                      {
2556                         png_memcpy(dp, v, 6);
2557                         dp -= 6;
2558                      }
2559                      sptr -= 6;
2560                   }
2561                } /* end of pixel_bytes == 6 */
2563                //--------------------------------------------------------------
2564                else
2565                {
2566                   for (i = width; i; i--)
2567                   {
2568                      png_byte v[8];
2569                      int j;
2570                      png_memcpy(v, sptr, pixel_bytes);
2571                      for (j = 0; j < png_pass_inc[pass]; j++)
2572                      {
2573                         png_memcpy(dp, v, pixel_bytes);
2574                         dp -= pixel_bytes;
2575                      }
2576                      sptr-= pixel_bytes;
2577                   }
2578                }
2579             } // end of _mmx_supported ========================================
2581             else /* MMX not supported:  use modified C code - takes advantage
2582                   *   of inlining of png_memcpy for a constant */
2583                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2584                   *   block be replaced with immediate value (e.g., 1)? */
2585                  /* GRR 19991017:  replaced with constants in each case */
2586 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2587             {
2588                if (pixel_bytes == 1)
2589                {
2590                   for (i = width; i; i--)
2591                   {
2592                      int j;
2593                      for (j = 0; j < png_pass_inc[pass]; j++)
2594                      {
2595                         *dp-- = *sptr;
2596                      }
2597                      --sptr;
2598                   }
2599                }
2600                else if (pixel_bytes == 3)
2601                {
2602                   for (i = width; i; i--)
2603                   {
2604                      png_byte v[8];
2605                      int j;
2606                      png_memcpy(v, sptr, 3);
2607                      for (j = 0; j < png_pass_inc[pass]; j++)
2608                      {
2609                         png_memcpy(dp, v, 3);
2610                         dp -= 3;
2611                      }
2612                      sptr -= 3;
2613                   }
2614                }
2615                else if (pixel_bytes == 2)
2616                {
2617                   for (i = width; i; i--)
2618                   {
2619                      png_byte v[8];
2620                      int j;
2621                      png_memcpy(v, sptr, 2);
2622                      for (j = 0; j < png_pass_inc[pass]; j++)
2623                      {
2624                         png_memcpy(dp, v, 2);
2625                         dp -= 2;
2626                      }
2627                      sptr -= 2;
2628                   }
2629                }
2630                else if (pixel_bytes == 4)
2631                {
2632                   for (i = width; i; i--)
2633                   {
2634                      png_byte v[8];
2635                      int j;
2636                      png_memcpy(v, sptr, 4);
2637                      for (j = 0; j < png_pass_inc[pass]; j++)
2638                      {
2639 #ifdef PNG_DEBUG
2640                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2641                         {
2642                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2643                              row, dp, row+png_ptr->row_buf_size);
2644                            printf("row_buf=%d\n",png_ptr->row_buf_size);
2645                         }
2646 #endif
2647                         png_memcpy(dp, v, 4);
2648                         dp -= 4;
2649                      }
2650                      sptr -= 4;
2651                   }
2652                }
2653                else if (pixel_bytes == 6)
2654                {
2655                   for (i = width; i; i--)
2656                   {
2657                      png_byte v[8];
2658                      int j;
2659                      png_memcpy(v, sptr, 6);
2660                      for (j = 0; j < png_pass_inc[pass]; j++)
2661                      {
2662                         png_memcpy(dp, v, 6);
2663                         dp -= 6;
2664                      }
2665                      sptr -= 6;
2666                   }
2667                }
2668                else if (pixel_bytes == 8)
2669                {
2670                   for (i = width; i; i--)
2671                   {
2672                      png_byte v[8];
2673                      int j;
2674                      png_memcpy(v, sptr, 8);
2675                      for (j = 0; j < png_pass_inc[pass]; j++)
2676                      {
2677                         png_memcpy(dp, v, 8);
2678                         dp -= 8;
2679                      }
2680                      sptr -= 8;
2681                   }
2682                }
2683                else     /* GRR:  should never be reached */
2684                {
2685                   for (i = width; i; i--)
2686                   {
2687                      png_byte v[8];
2688                      int j;
2689                      png_memcpy(v, sptr, pixel_bytes);
2690                      for (j = 0; j < png_pass_inc[pass]; j++)
2691                      {
2692                         png_memcpy(dp, v, pixel_bytes);
2693                         dp -= pixel_bytes;
2694                      }
2695                      sptr -= pixel_bytes;
2696                   }
2697                }
2699             } /* end if (MMX not supported) */
2700             break;
2701          }
2702       } /* end switch (row_info->pixel_depth) */
2704       row_info->width = final_width;
2705       row_info->rowbytes = ((final_width *
2706          (png_uint_32)row_info->pixel_depth + 7) >> 3);
2707    }
2709 } /* end png_do_read_interlace() */
2711 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2712 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2716 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2717 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2719 // These variables are utilized in the functions below.  They are declared
2720 // globally here to ensure alignment on 8-byte boundaries.
2722 union uAll {          // 8-byte cell type for the globals shared by the C code and the MMX inline asm below
2723    long long use;     // 64-bit integer view; this is the member the C code assigns (e.g. _ShiftBpp.use = 24)
2724    double  align;     // not accessed in the visible code; presumably present only to force 8-byte alignment -- TODO confirm
2725 } _LBCarryMask = {0x0101010101010101LL},   // constant: 0x01 in each of the 8 bytes; loaded into %mm5 by the Avg filter asm
2726   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},   // constant: 0x7f in each of the 8 bytes (high bit of every byte clear)
2727   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;   // no initializer: zero at program start, filled in at run time
     // NOTE: these objects are referenced by bare symbol name inside the inline
     //   assembly (e.g. "movq _ActiveMask, %%mm7", "movq _LBCarryMask, %%mm5"),
     //   so renaming any of them would break those references.
     // _ActiveMask, _ShiftBpp and _ShiftRem are written through .use by
     //   png_read_filter_row_mmx_avg() per bytes-per-pixel case before its asm
     //   reads them; because the state is global and mutable, that code is only
     //   compiled under PNG_THREAD_UNSAFE_OK.
2729 #ifdef PNG_THREAD_UNSAFE_OK
2730 //===========================================================================//
2731 //                                                                           //
2732 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2733 //                                                                           //
2734 //===========================================================================//
2736 // Optimized code for PNG Average filter decoder
2738 static void /* PRIVATE */
2739 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2740                             png_bytep prev_row)
2742    int bpp;
2743    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2744    int dummy_value_S;
2745    int dummy_value_D;
2747    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2748    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2750    __asm__ __volatile__ (
2751       // initialize address pointers and offset
2752 #ifdef __PIC__
2753       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2754 #endif
2755 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2756       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2757       "movl %%edi, %%edx           \n\t"
2758 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2759 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2760       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2762       "xorl %%eax,%%eax            \n\t"
2764       // Compute the Raw value for the first bpp bytes
2765       //    Raw(x) = Avg(x) + (Prior(x)/2)
2766    "avg_rlp:                       \n\t"
2767       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2768       "incl %%ebx                  \n\t"
2769       "shrb %%al                   \n\t" // divide by 2
2770       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2771 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2772       "cmpl %%ecx, %%ebx           \n\t"
2773       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2774       "jb avg_rlp                  \n\t" // mov does not affect flags
2776       // get # of bytes to alignment
2777       "movl %%edi, _dif            \n\t" // take start of row
2778       "addl %%ebx, _dif            \n\t" // add bpp
2779       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2780       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2781       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2782       "jz avg_go                   \n\t" //  alignment
2784       // fix alignment
2785       // Compute the Raw value for the bytes up to the alignment boundary
2786       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2787       "xorl %%ecx, %%ecx           \n\t"
2789    "avg_lp1:                       \n\t"
2790       "xorl %%eax, %%eax           \n\t"
2791       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2792       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2793       "addw %%cx, %%ax             \n\t"
2794       "incl %%ebx                  \n\t"
2795       "shrw %%ax                   \n\t" // divide by 2
2796       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2797       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2798       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2799       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2801    "avg_go:                        \n\t"
2802       "movl _FullLength, %%eax     \n\t"
2803       "movl %%eax, %%ecx           \n\t"
2804       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2805       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2806       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2807       "movl %%ecx, _MMXLength      \n\t"
2808 #ifdef __PIC__
2809       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2810 #endif
2812       : "=c" (dummy_value_c),            // output regs (dummy)
2813         "=S" (dummy_value_S),
2814         "=D" (dummy_value_D)
2816       : "0" (bpp),       // ecx          // input regs
2817         "1" (prev_row),  // esi
2818         "2" (row)        // edi
2820       : "%eax", "%edx"                   // clobber list
2821 #ifndef __PIC__
2822       , "%ebx"
2823 #endif
2824       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2825       // (seems to work fine without...)
2826    );
2828    // now do the math for the rest of the row
2829    switch (bpp)
2830    {
2831       case 3:
2832       {
2833          _ActiveMask.use  = 0x0000000000ffffffLL;
2834          _ShiftBpp.use = 24;    // == 3 * 8
2835          _ShiftRem.use = 40;    // == 64 - 24
2837          __asm__ __volatile__ (
2838             // re-init address pointers and offset
2839             "movq _ActiveMask, %%mm7      \n\t"
2840             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2841             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2842 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2843             "movq _HBClearMask, %%mm4     \n\t"
2844 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2846             // prime the pump:  load the first Raw(x-bpp) data set
2847             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2848                                                 // (correct pos. in loop below)
2849          "avg_3lp:                        \n\t"
2850             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2851             "movq %%mm5, %%mm3            \n\t"
2852             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
2853                                                 // data
2854             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2855             "movq %%mm7, %%mm6            \n\t"
2856             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2857             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2858             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
2859                                                 // byte
2860             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
2861                                                 // each byte
2862             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2863             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2864                                                 // LBCarrys
2865             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2866                                                 // where both
2867                                // lsb's were == 1 (only valid for active group)
2868             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2869             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2870                                                 // byte
2871             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2872                                                 // for each byte
2873             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
2874                                                 // bytes to add to Avg
2875             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2876                                                 // Avg for each Active
2877                                //  byte
2878             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2879             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
2880                                                 // bytes 3-5
2881             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2882             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2883             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2884                                                 // LBCarrys
2885             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2886                                                 // where both
2887                                // lsb's were == 1 (only valid for active group)
2888             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2889             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2890                                                 // byte
2891             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2892                                                 // for each byte
2893             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2894                                                 // bytes to add to Avg
2895             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2896                                                 // Avg for each Active
2897                                //  byte
2899             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2900             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
2901                                                 // two
2902                                  // bytes
2903             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2904             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2905                               // Data only needs to be shifted once here to
2906                               // get the correct x-bpp offset.
2907             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2908                                                 // LBCarrys
2909             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2910                                                 // where both
2911                               // lsb's were == 1 (only valid for active group)
2912             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2913             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2914                                                 // byte
2915             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2916                                                 // for each byte
2917             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2918                                                 // bytes to add to Avg
2919             "addl $8, %%ecx               \n\t"
2920             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2921                                                 // Avg for each Active
2922                                                 // byte
2923             // now ready to write back to memory
2924             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2925             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2926             "cmpl _MMXLength, %%ecx       \n\t"
2927             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2928             "jb avg_3lp                   \n\t"
2930             : "=S" (dummy_value_S),             // output regs (dummy)
2931               "=D" (dummy_value_D)
2933             : "0" (prev_row),  // esi           // input regs
2934               "1" (row)        // edi
2936             : "%ecx"                            // clobber list
2937 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2938             , "%mm0", "%mm1", "%mm2", "%mm3"
2939             , "%mm4", "%mm5", "%mm6", "%mm7"
2940 #endif
2941          );
2942       }
2943       break;  // end 3 bpp
2945       case 6:
2946       case 4:
2947       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2948       //case 5:   // GRR BOGUS
2949       {
2950          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2951                                                   // appropriate inactive bytes
2952          _ShiftBpp.use = bpp << 3;
2953          _ShiftRem.use = 64 - _ShiftBpp.use;
2955          __asm__ __volatile__ (
2956             "movq _HBClearMask, %%mm4    \n\t"
2958             // re-init address pointers and offset
2959             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
2960                                                // alignment boundary
2962             // load _ActiveMask and clear all bytes except for 1st active group
2963             "movq _ActiveMask, %%mm7     \n\t"
2964 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2965             "psrlq _ShiftRem, %%mm7      \n\t"
2966 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2967             "movq %%mm7, %%mm6           \n\t"
2968             "movq _LBCarryMask, %%mm5    \n\t"
2969             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
2970                                                // group
2972             // prime the pump:  load the first Raw(x-bpp) data set
2973             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2974                                           // (we correct pos. in loop below)
2975          "avg_4lp:                       \n\t"
2976             "movq (%%edi,%%ecx,), %%mm0  \n\t"
2977             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
2978             "movq (%%esi,%%ecx,), %%mm1  \n\t"
2979             // add (Prev_row/2) to average
2980             "movq %%mm5, %%mm3           \n\t"
2981             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2982             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2983             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
2984                                                // byte
2985             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
2986                                                // each byte
2987             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2988             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
2989                                                // LBCarrys
2990             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
2991                                                // where both
2992                               // lsb's were == 1 (only valid for active group)
2993             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2994             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
2995                                                // byte
2996             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2997                                                // for each byte
2998             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
2999                                                // bytes to add to Avg
3000             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3001                                                // for each Active
3002                               // byte
3003             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3004             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3005             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3006             "addl $8, %%ecx              \n\t"
3007             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3008                                                // LBCarrys
3009             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3010                                                // where both
3011                               // lsb's were == 1 (only valid for active group)
3012             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3013             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3014                                                // byte
3015             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3016                                                // for each byte
3017             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3018                                                // bytes to add to Avg
3019             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3020                                                // Avg for each Active
3021                               // byte
3022             "cmpl _MMXLength, %%ecx      \n\t"
3023             // now ready to write back to memory
3024             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3025             // prep Raw(x-bpp) for next loop
3026             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3027             "jb avg_4lp                  \n\t"
3029             : "=S" (dummy_value_S),            // output regs (dummy)
3030               "=D" (dummy_value_D)
3032             : "0" (prev_row),  // esi          // input regs
3033               "1" (row)        // edi
3035             : "%ecx"                           // clobber list
3036 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3037             , "%mm0", "%mm1", "%mm2", "%mm3"
3038             , "%mm4", "%mm5", "%mm6", "%mm7"
3039 #endif
3040          );
3041       }
3042       break;  // end 4,6 bpp
3044       case 2:
3045       {
3046          _ActiveMask.use  = 0x000000000000ffffLL;
3047          _ShiftBpp.use = 16;   // == 2 * 8
3048          _ShiftRem.use = 48;   // == 64 - 16
3050          __asm__ __volatile__ (
3051             // load _ActiveMask
3052             "movq _ActiveMask, %%mm7     \n\t"
3053             // re-init address pointers and offset
3054             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
3055                                                // boundary
3056             "movq _LBCarryMask, %%mm5    \n\t"
3057 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3058             "movq _HBClearMask, %%mm4    \n\t"
3059 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3061             // prime the pump:  load the first Raw(x-bpp) data set
3062             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3063                               // (we correct pos. in loop below)
3064          "avg_2lp:                       \n\t"
3065             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3066             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3067             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
3068             // add (Prev_row/2) to average
3069             "movq %%mm5, %%mm3           \n\t"
3070             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3071             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3072             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3073                                                // byte
3074             "movq %%mm7, %%mm6           \n\t"
3075             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3076                                                // each byte
3078             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3079             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3080                                                // LBCarrys
3081             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3082                                                // where both
3083                                                // lsb's were == 1 (only valid
3084                                                // for active group)
3085             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3086             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3087                                                // byte
3088             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3089                                                // for each byte
3090             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3091                                                // bytes to add to Avg
3092             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3093                                                // for each Active byte
3095             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3096             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3097                                                // bytes 2 & 3
3098             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3099             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3100             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3101                                                // LBCarrys
3102             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3103                                                // where both
3104                                                // lsb's were == 1 (only valid
3105                                                // for active group)
3106             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3107             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3108                                                // byte
3109             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3110                                                // for each byte
3111             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3112                                                // bytes to add to Avg
3113             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3114                                                // Avg for each Active byte
3116             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3117             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3118                                                // bytes 4 & 5
3119             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3120             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3121             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3122                                                // LBCarrys
3123             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3124                                                // where both lsb's were == 1
3125                                                // (only valid for active group)
3126             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3127             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3128                                                // byte
3129             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3130                                                // for each byte
3131             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3132                                                // bytes to add to Avg
3133             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3134                                                // Avg for each Active byte
3136             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3137             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3138                                                // bytes 6 & 7
3139             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3140             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3141             "addl $8, %%ecx              \n\t"
3142             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3143                                                // LBCarrys
3144             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3145                                                // where both
3146                                                // lsb's were == 1 (only valid
3147                                                // for active group)
3148             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3149             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3150                                                // byte
3151             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3152                                                // for each byte
3153             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3154                                                // bytes to add to Avg
3155             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3156                                                // Avg for each Active byte
3158             "cmpl _MMXLength, %%ecx      \n\t"
3159             // now ready to write back to memory
3160             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3161             // prep Raw(x-bpp) for next loop
3162             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3163             "jb avg_2lp                  \n\t"
3165             : "=S" (dummy_value_S),            // output regs (dummy)
3166               "=D" (dummy_value_D)
3168             : "0" (prev_row),  // esi          // input regs
3169               "1" (row)        // edi
3171             : "%ecx"                           // clobber list
3172 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3173             , "%mm0", "%mm1", "%mm2", "%mm3"
3174             , "%mm4", "%mm5", "%mm6", "%mm7"
3175 #endif
3176          );
3177       }
3178       break;  // end 2 bpp
3180       case 1:
3181       {
3182          __asm__ __volatile__ (
3183             // re-init address pointers and offset
3184 #ifdef __PIC__
3185             "pushl %%ebx                 \n\t" // save Global Offset Table index
3186 #endif
3187             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
3188                                                // boundary
3189 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3190             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3191             "jnb avg_1end                \n\t"
3192             // do Avg decode for remaining bytes
3193 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3194             "movl %%edi, %%edx           \n\t"
3195 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3196             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3197             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3198                                                //  in loop below
3199          "avg_1lp:                       \n\t"
3200             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3201             "xorl %%eax, %%eax           \n\t"
3202             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3203             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3204             "addw %%cx, %%ax             \n\t"
3205             "incl %%ebx                  \n\t"
3206             "shrw %%ax                   \n\t" // divide by 2
3207             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3208                                                // inc ebx
3209             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3210             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3211                          // mov does not affect flags; -1 to offset inc ebx
3212             "jb avg_1lp                  \n\t"
3214          "avg_1end:                      \n\t"
3215 #ifdef __PIC__
3216             "popl %%ebx                  \n\t" // Global Offset Table index
3217 #endif
3219             : "=c" (dummy_value_c),            // output regs (dummy)
3220               "=S" (dummy_value_S),
3221               "=D" (dummy_value_D)
3223             : "0" (bpp),       // ecx          // input regs
3224               "1" (prev_row),  // esi
3225               "2" (row)        // edi
3227             : "%eax", "%edx"                   // clobber list
3228 #ifndef __PIC__
3229             , "%ebx"
3230 #endif
3231          );
3232       }
3233       return;  // end 1 bpp
3235       case 8:
3236       {
3237          __asm__ __volatile__ (
3238             // re-init address pointers and offset
3239             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
3240             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
3241 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3242             "movq _HBClearMask, %%mm4    \n\t"
3243 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3245             // prime the pump:  load the first Raw(x-bpp) data set
3246             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3247                                       // (NO NEED to correct pos. in loop below)
3249          "avg_8lp:                       \n\t"
3250             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3251             "movq %%mm5, %%mm3           \n\t"
3252             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3253             "addl $8, %%ecx              \n\t"
3254             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3255             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3256             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3257                                                //  where both lsb's were == 1
3258             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3259             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3260             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3261             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3262             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3263             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3264             "cmpl _MMXLength, %%ecx      \n\t"
3265             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3266             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3267             "jb avg_8lp                  \n\t"
3269             : "=S" (dummy_value_S),            // output regs (dummy)
3270               "=D" (dummy_value_D)
3272             : "0" (prev_row),  // esi          // input regs
3273               "1" (row)        // edi
3275             : "%ecx"                           // clobber list
3276 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3277             , "%mm0", "%mm1", "%mm2"
3278             , "%mm3", "%mm4", "%mm5"
3279 #endif
3280          );
3281       }
3282       break;  // end 8 bpp
3284       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3285       {
3287 #ifdef PNG_DEBUG
3288          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
3289         png_debug(1,
3290         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3291 #endif
3293 #if 0
3294         __asm__ __volatile__ (
3295             "movq _LBCarryMask, %%mm5    \n\t"
3296             // re-init address pointers and offset
3297             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
3298                                                // alignment boundary
3299             "movl row, %%edi             \n\t" // edi:  Avg(x)
3300             "movq _HBClearMask, %%mm4    \n\t"
3301             "movl %%edi, %%edx           \n\t"
3302             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3303             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3304          "avg_Alp:                       \n\t"
3305             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3306             "movq %%mm5, %%mm3           \n\t"
3307             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3308             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3309             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3310             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3311             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3312                                                // where both lsb's were == 1
3313             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3314             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3315                                                // byte
3316             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
3317                                                // byte
3318             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3319                                                // byte
3320             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3321                                                // each byte
3322             "addl $8, %%ebx              \n\t"
3323             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3324                                                // byte
3325             "cmpl _MMXLength, %%ebx      \n\t"
3326             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3327             "jb avg_Alp                  \n\t"
3329             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3331             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3333             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3334          );
3335 #endif /* 0 - NEVER REACHED */
3336       }
3337       break;
3339    } // end switch (bpp)
3341    __asm__ __volatile__ (
3342       // MMX acceleration complete; now do clean-up
3343       // check if any remaining bytes left to decode
3344 #ifdef __PIC__
3345       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3346 #endif
3347       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3348 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
3349       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3350       "jnb avg_end                 \n\t"
3352       // do Avg decode for remaining bytes
3353 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3354       "movl %%edi, %%edx           \n\t"
3355 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3356       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3357       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3359    "avg_lp2:                       \n\t"
3360       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3361       "xorl %%eax, %%eax           \n\t"
3362       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3363       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3364       "addw %%cx, %%ax             \n\t"
3365       "incl %%ebx                  \n\t"
3366       "shrw %%ax                   \n\t" // divide by 2
3367       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3368       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3369       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3370       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3372    "avg_end:                       \n\t"
3373       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3374 #ifdef __PIC__
3375       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3376 #endif
3378       : "=c" (dummy_value_c),            // output regs (dummy)
3379         "=S" (dummy_value_S),
3380         "=D" (dummy_value_D)
3382       : "0" (bpp),       // ecx          // input regs
3383         "1" (prev_row),  // esi
3384         "2" (row)        // edi
3386       : "%eax", "%edx"                   // clobber list
3387 #ifndef __PIC__
3388       , "%ebx"
3389 #endif
3390    );
3392 } /* end png_read_filter_row_mmx_avg() */
3393 #endif
3397 #ifdef PNG_THREAD_UNSAFE_OK
3398 //===========================================================================//
3399 //                                                                           //
3400 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
3401 //                                                                           //
3402 //===========================================================================//
3404 // Optimized code for PNG Paeth filter decoder
3406 static void /* PRIVATE */
3407 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3408                               png_bytep prev_row)
3410    int bpp;
3411    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3412    int dummy_value_S;
3413    int dummy_value_D;
3415    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3416    _FullLength  = row_info->rowbytes; // # of bytes to filter
3418    __asm__ __volatile__ (
3419 #ifdef __PIC__
3420       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3421 #endif
3422       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
3423 //pre "movl row, %%edi             \n\t"
3424       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
3425 //pre "movl prev_row, %%esi        \n\t"
3426       "xorl %%eax, %%eax           \n\t"
3428       // Compute the Raw value for the first bpp bytes
3429       // Note: the formula works out to be always
3430       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
3431    "paeth_rlp:                     \n\t"
3432       "movb (%%edi,%%ebx,), %%al   \n\t"
3433       "addb (%%esi,%%ebx,), %%al   \n\t"
3434       "incl %%ebx                  \n\t"
3435 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
3436       "cmpl %%ecx, %%ebx           \n\t"
3437       "movb %%al, -1(%%edi,%%ebx,) \n\t"
3438       "jb paeth_rlp                \n\t"
3439       // get # of bytes to alignment
3440       "movl %%edi, _dif            \n\t" // take start of row
3441       "addl %%ebx, _dif            \n\t" // add bpp
3442       "xorl %%ecx, %%ecx           \n\t"
3443       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
3444                                          // boundary
3445       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
3446       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
3447                                          // at alignment
3448       "jz paeth_go                 \n\t"
3449       // fix alignment
3451    "paeth_lp1:                     \n\t"
3452       "xorl %%eax, %%eax           \n\t"
3453       // pav = p - a = (a + b - c) - a = b - c
3454       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3455       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3456       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3457       "movl %%eax, _patemp         \n\t" // Save pav for later use
3458       "xorl %%eax, %%eax           \n\t"
3459       // pbv = p - b = (a + b - c) - b = a - c
3460       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3461       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3462       "movl %%eax, %%ecx           \n\t"
3463       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3464       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3465       // pc = abs(pcv)
3466       "testl $0x80000000, %%eax    \n\t"
3467       "jz paeth_pca                \n\t"
3468       "negl %%eax                  \n\t" // reverse sign of neg values
3470    "paeth_pca:                     \n\t"
3471       "movl %%eax, _pctemp         \n\t" // save pc for later use
3472       // pb = abs(pbv)
3473       "testl $0x80000000, %%ecx    \n\t"
3474       "jz paeth_pba                \n\t"
3475       "negl %%ecx                  \n\t" // reverse sign of neg values
3477    "paeth_pba:                     \n\t"
3478       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
3479       // pa = abs(pav)
3480       "movl _patemp, %%eax         \n\t"
3481       "testl $0x80000000, %%eax    \n\t"
3482       "jz paeth_paa                \n\t"
3483       "negl %%eax                  \n\t" // reverse sign of neg values
3485    "paeth_paa:                     \n\t"
3486       "movl %%eax, _patemp         \n\t" // save pa for later use
3487       // test if pa <= pb
3488       "cmpl %%ecx, %%eax           \n\t"
3489       "jna paeth_abb               \n\t"
3490       // pa > pb; now test if pb <= pc
3491       "cmpl _pctemp, %%ecx         \n\t"
3492       "jna paeth_bbc               \n\t"
3493       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3494       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3495       "jmp paeth_paeth             \n\t"
3497    "paeth_bbc:                     \n\t"
3498       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3499       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
3500       "jmp paeth_paeth             \n\t"
3502    "paeth_abb:                     \n\t"
3503       // pa <= pb; now test if pa <= pc
3504       "cmpl _pctemp, %%eax         \n\t"
3505       "jna paeth_abc               \n\t"
3506       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3507       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3508       "jmp paeth_paeth             \n\t"
3510    "paeth_abc:                     \n\t"
3511       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3512       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
3514    "paeth_paeth:                   \n\t"
3515       "incl %%ebx                  \n\t"
3516       "incl %%edx                  \n\t"
3517       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3518       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3519       "cmpl _dif, %%ebx            \n\t"
3520       "jb paeth_lp1                \n\t"
3522    "paeth_go:                      \n\t"
3523       "movl _FullLength, %%ecx     \n\t"
3524       "movl %%ecx, %%eax           \n\t"
3525       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3526       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3527       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
3528       "movl %%ecx, _MMXLength      \n\t"
3529 #ifdef __PIC__
3530       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3531 #endif
3533       : "=c" (dummy_value_c),            // output regs (dummy)
3534         "=S" (dummy_value_S),
3535         "=D" (dummy_value_D)
3537       : "0" (bpp),       // ecx          // input regs
3538         "1" (prev_row),  // esi
3539         "2" (row)        // edi
3541       : "%eax", "%edx"                   // clobber list
3542 #ifndef __PIC__
3543       , "%ebx"
3544 #endif
3545    );
3547    // now do the math for the rest of the row
3548    switch (bpp)
3549    {
3550       case 3:
3551       {
3552          _ActiveMask.use = 0x0000000000ffffffLL;
3553          _ActiveMaskEnd.use = 0xffff000000000000LL;
3554          _ShiftBpp.use = 24;    // == bpp(3) * 8
3555          _ShiftRem.use = 40;    // == 64 - 24
3557          __asm__ __volatile__ (
3558             "movl _dif, %%ecx            \n\t"
3559 // preload  "movl row, %%edi             \n\t"
3560 // preload  "movl prev_row, %%esi        \n\t"
3561             "pxor %%mm0, %%mm0           \n\t"
3562             // prime the pump:  load the first Raw(x-bpp) data set
3563             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3564          "paeth_3lp:                     \n\t"
3565             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
3566                                                // 3 bytes
3567             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3568             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3569             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3570             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3571             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
3572                                                // 3 bytes
3573             // pav = p - a = (a + b - c) - a = b - c
3574             "movq %%mm2, %%mm4           \n\t"
3575             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3576             // pbv = p - b = (a + b - c) - b = a - c
3577             "movq %%mm1, %%mm5           \n\t"
3578             "psubw %%mm3, %%mm4          \n\t"
3579             "pxor %%mm7, %%mm7           \n\t"
3580             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3581             "movq %%mm4, %%mm6           \n\t"
3582             "psubw %%mm3, %%mm5          \n\t"
3584             // pa = abs(p-a) = abs(pav)
3585             // pb = abs(p-b) = abs(pbv)
3586             // pc = abs(p-c) = abs(pcv)
3587             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3588             "paddw %%mm5, %%mm6          \n\t"
3589             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3590             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3591             "psubw %%mm0, %%mm4          \n\t"
3592             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3593             "psubw %%mm0, %%mm4          \n\t"
3594             "psubw %%mm7, %%mm5          \n\t"
3595             "pxor %%mm0, %%mm0           \n\t"
3596             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3597             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3598             "psubw %%mm7, %%mm5          \n\t"
3599             "psubw %%mm0, %%mm6          \n\t"
3600             //  test pa <= pb
3601             "movq %%mm4, %%mm7           \n\t"
3602             "psubw %%mm0, %%mm6          \n\t"
3603             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3604             "movq %%mm7, %%mm0           \n\t"
3605             // use mm7 mask to merge pa & pb
3606             "pand %%mm7, %%mm5           \n\t"
3607             // use mm0 mask copy to merge a & b
3608             "pand %%mm0, %%mm2           \n\t"
3609             "pandn %%mm4, %%mm7          \n\t"
3610             "pandn %%mm1, %%mm0          \n\t"
3611             "paddw %%mm5, %%mm7          \n\t"
3612             "paddw %%mm2, %%mm0          \n\t"
3613             //  test  ((pa <= pb)? pa:pb) <= pc
3614             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3615             "pxor %%mm1, %%mm1           \n\t"
3616             "pand %%mm7, %%mm3           \n\t"
3617             "pandn %%mm0, %%mm7          \n\t"
3618             "paddw %%mm3, %%mm7          \n\t"
3619             "pxor %%mm0, %%mm0           \n\t"
3620             "packuswb %%mm1, %%mm7       \n\t"
3621             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3622             "pand _ActiveMask, %%mm7     \n\t"
3623             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3624             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3625             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3626             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3627             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
3628                                                // Raw(x-bpp)
3629             // now do Paeth for 2nd set of bytes (3-5)
3630             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
3631             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3632             "pxor %%mm7, %%mm7           \n\t"
3633             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3634             // pbv = p - b = (a + b - c) - b = a - c
3635             "movq %%mm1, %%mm5           \n\t"
3636             // pav = p - a = (a + b - c) - a = b - c
3637             "movq %%mm2, %%mm4           \n\t"
3638             "psubw %%mm3, %%mm5          \n\t"
3639             "psubw %%mm3, %%mm4          \n\t"
3640             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3641             //       pav + pbv = pbv + pav
3642             "movq %%mm5, %%mm6           \n\t"
3643             "paddw %%mm4, %%mm6          \n\t"
3645             // pa = abs(p-a) = abs(pav)
3646             // pb = abs(p-b) = abs(pbv)
3647             // pc = abs(p-c) = abs(pcv)
3648             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
3649             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
3650             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
3651             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
3652             "psubw %%mm0, %%mm5          \n\t"
3653             "psubw %%mm7, %%mm4          \n\t"
3654             "psubw %%mm0, %%mm5          \n\t"
3655             "psubw %%mm7, %%mm4          \n\t"
3656             "pxor %%mm0, %%mm0           \n\t"
3657             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3658             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3659             "psubw %%mm0, %%mm6          \n\t"
3660             //  test pa <= pb
3661             "movq %%mm4, %%mm7           \n\t"
3662             "psubw %%mm0, %%mm6          \n\t"
3663             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3664             "movq %%mm7, %%mm0           \n\t"
3665             // use mm7 mask to merge pa & pb
3666             "pand %%mm7, %%mm5           \n\t"
3667             // use mm0 mask copy to merge a & b
3668             "pand %%mm0, %%mm2           \n\t"
3669             "pandn %%mm4, %%mm7          \n\t"
3670             "pandn %%mm1, %%mm0          \n\t"
3671             "paddw %%mm5, %%mm7          \n\t"
3672             "paddw %%mm2, %%mm0          \n\t"
3673             //  test  ((pa <= pb)? pa:pb) <= pc
3674             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3675             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3676             "pand %%mm7, %%mm3           \n\t"
3677             "pandn %%mm0, %%mm7          \n\t"
3678             "pxor %%mm1, %%mm1           \n\t"
3679             "paddw %%mm3, %%mm7          \n\t"
3680             "pxor %%mm0, %%mm0           \n\t"
3681             "packuswb %%mm1, %%mm7       \n\t"
3682             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
3683             "pand _ActiveMask, %%mm7     \n\t"
3684             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3685             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
3686                                                // 3 bytes
3687              // pav = p - a = (a + b - c) - a = b - c
3688             "movq %%mm2, %%mm4           \n\t"
3689             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3690             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
3691             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3692             "movq %%mm7, %%mm1           \n\t"
3693             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3694             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
3695                                     // now mm1 will be used as Raw(x-bpp)
3696             // now do Paeth for 3rd, and final, set of bytes (6-7)
3697             "pxor %%mm7, %%mm7           \n\t"
3698             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3699             "psubw %%mm3, %%mm4          \n\t"
3700             // pbv = p - b = (a + b - c) - b = a - c
3701             "movq %%mm1, %%mm5           \n\t"
3702             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3703             "movq %%mm4, %%mm6           \n\t"
3704             "psubw %%mm3, %%mm5          \n\t"
3705             "pxor %%mm0, %%mm0           \n\t"
3706             "paddw %%mm5, %%mm6          \n\t"
3708             // pa = abs(p-a) = abs(pav)
3709             // pb = abs(p-b) = abs(pbv)
3710             // pc = abs(p-c) = abs(pcv)
3711             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3712             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3713             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3714             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3715             "psubw %%mm0, %%mm4          \n\t"
3716             "psubw %%mm7, %%mm5          \n\t"
3717             "psubw %%mm0, %%mm4          \n\t"
3718             "psubw %%mm7, %%mm5          \n\t"
3719             "pxor %%mm0, %%mm0           \n\t"
3720             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3721             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3722             "psubw %%mm0, %%mm6          \n\t"
3723             //  test pa <= pb
3724             "movq %%mm4, %%mm7           \n\t"
3725             "psubw %%mm0, %%mm6          \n\t"
3726             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3727             "movq %%mm7, %%mm0           \n\t"
3728             // use mm0 mask copy to merge a & b
3729             "pand %%mm0, %%mm2           \n\t"
3730             // use mm7 mask to merge pa & pb
3731             "pand %%mm7, %%mm5           \n\t"
3732             "pandn %%mm1, %%mm0          \n\t"
3733             "pandn %%mm4, %%mm7          \n\t"
3734             "paddw %%mm2, %%mm0          \n\t"
3735             "paddw %%mm5, %%mm7          \n\t"
3736             //  test  ((pa <= pb)? pa:pb) <= pc
3737             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3738             "pand %%mm7, %%mm3           \n\t"
3739             "pandn %%mm0, %%mm7          \n\t"
3740             "paddw %%mm3, %%mm7          \n\t"
3741             "pxor %%mm1, %%mm1           \n\t"
3742             "packuswb %%mm7, %%mm1       \n\t"
3743             // step ecx to next set of 8 bytes and repeat loop til done
3744             "addl $8, %%ecx              \n\t"
3745             "pand _ActiveMaskEnd, %%mm1  \n\t"
3746             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3747                                                  // Raw(x)
3749             "cmpl _MMXLength, %%ecx      \n\t"
3750             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
3751             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3752                                  // mm1 will be used as Raw(x-bpp) next loop
3753                            // mm3 ready to be used as Prior(x-bpp) next loop
3754             "jb paeth_3lp                \n\t"
3756             : "=S" (dummy_value_S),             // output regs (dummy)
3757               "=D" (dummy_value_D)
3759             : "0" (prev_row),  // esi           // input regs
3760               "1" (row)        // edi
3762             : "%ecx"                            // clobber list
3763 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3764             , "%mm0", "%mm1", "%mm2", "%mm3"
3765             , "%mm4", "%mm5", "%mm6", "%mm7"
3766 #endif
3767          );
3768       }
3769       break;  // end 3 bpp
3771       case 6:
3772       //case 7:   // GRR BOGUS
3773       //case 5:   // GRR BOGUS
3774       {
3775          _ActiveMask.use  = 0x00000000ffffffffLL;
3776          _ActiveMask2.use = 0xffffffff00000000LL;
3777          _ShiftBpp.use = bpp << 3;    // == bpp * 8
3778          _ShiftRem.use = 64 - _ShiftBpp.use;
3780          __asm__ __volatile__ (
3781             "movl _dif, %%ecx            \n\t"
3782 // preload  "movl row, %%edi             \n\t"
3783 // preload  "movl prev_row, %%esi        \n\t"
3784             // prime the pump:  load the first Raw(x-bpp) data set
3785             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3786             "pxor %%mm0, %%mm0           \n\t"
3788          "paeth_6lp:                     \n\t"
3789             // must shift to position Raw(x-bpp) data
3790             "psrlq _ShiftRem, %%mm1      \n\t"
3791             // do first set of 4 bytes
3792             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3793             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3794             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3795             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3796             // must shift to position Prior(x-bpp) data
3797             "psrlq _ShiftRem, %%mm3      \n\t"
3798             // pav = p - a = (a + b - c) - a = b - c
3799             "movq %%mm2, %%mm4           \n\t"
3800             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3801             // pbv = p - b = (a + b - c) - b = a - c
3802             "movq %%mm1, %%mm5           \n\t"
3803             "psubw %%mm3, %%mm4          \n\t"
3804             "pxor %%mm7, %%mm7           \n\t"
3805             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3806             "movq %%mm4, %%mm6           \n\t"
3807             "psubw %%mm3, %%mm5          \n\t"
3808             // pa = abs(p-a) = abs(pav)
3809             // pb = abs(p-b) = abs(pbv)
3810             // pc = abs(p-c) = abs(pcv)
3811             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3812             "paddw %%mm5, %%mm6          \n\t"
3813             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3814             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3815             "psubw %%mm0, %%mm4          \n\t"
3816             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3817             "psubw %%mm0, %%mm4          \n\t"
3818             "psubw %%mm7, %%mm5          \n\t"
3819             "pxor %%mm0, %%mm0           \n\t"
3820             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3821             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3822             "psubw %%mm7, %%mm5          \n\t"
3823             "psubw %%mm0, %%mm6          \n\t"
3824             //  test pa <= pb
3825             "movq %%mm4, %%mm7           \n\t"
3826             "psubw %%mm0, %%mm6          \n\t"
3827             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3828             "movq %%mm7, %%mm0           \n\t"
3829             // use mm7 mask to merge pa & pb
3830             "pand %%mm7, %%mm5           \n\t"
3831             // use mm0 mask copy to merge a & b
3832             "pand %%mm0, %%mm2           \n\t"
3833             "pandn %%mm4, %%mm7          \n\t"
3834             "pandn %%mm1, %%mm0          \n\t"
3835             "paddw %%mm5, %%mm7          \n\t"
3836             "paddw %%mm2, %%mm0          \n\t"
3837             //  test  ((pa <= pb)? pa:pb) <= pc
3838             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3839             "pxor %%mm1, %%mm1           \n\t"
3840             "pand %%mm7, %%mm3           \n\t"
3841             "pandn %%mm0, %%mm7          \n\t"
3842             "paddw %%mm3, %%mm7          \n\t"
3843             "pxor %%mm0, %%mm0           \n\t"
3844             "packuswb %%mm1, %%mm7       \n\t"
3845             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3846             "pand _ActiveMask, %%mm7     \n\t"
3847             "psrlq _ShiftRem, %%mm3      \n\t"
3848             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
3849             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3850             "movq %%mm2, %%mm6           \n\t"
3851             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3852             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3853             "psllq _ShiftBpp, %%mm6      \n\t"
3854             "movq %%mm7, %%mm5           \n\t"
3855             "psrlq _ShiftRem, %%mm1      \n\t"
3856             "por %%mm6, %%mm3            \n\t"
3857             "psllq _ShiftBpp, %%mm5      \n\t"
3858             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3859             "por %%mm5, %%mm1            \n\t"
3860             // do second set of 4 bytes
3861             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3862             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3863             // pav = p - a = (a + b - c) - a = b - c
3864             "movq %%mm2, %%mm4           \n\t"
3865             // pbv = p - b = (a + b - c) - b = a - c
3866             "movq %%mm1, %%mm5           \n\t"
3867             "psubw %%mm3, %%mm4          \n\t"
3868             "pxor %%mm7, %%mm7           \n\t"
3869             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3870             "movq %%mm4, %%mm6           \n\t"
3871             "psubw %%mm3, %%mm5          \n\t"
3872             // pa = abs(p-a) = abs(pav)
3873             // pb = abs(p-b) = abs(pbv)
3874             // pc = abs(p-c) = abs(pcv)
3875             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3876             "paddw %%mm5, %%mm6          \n\t"
3877             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3878             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3879             "psubw %%mm0, %%mm4          \n\t"
3880             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3881             "psubw %%mm0, %%mm4          \n\t"
3882             "psubw %%mm7, %%mm5          \n\t"
3883             "pxor %%mm0, %%mm0           \n\t"
3884             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3885             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3886             "psubw %%mm7, %%mm5          \n\t"
3887             "psubw %%mm0, %%mm6          \n\t"
3888             //  test pa <= pb
3889             "movq %%mm4, %%mm7           \n\t"
3890             "psubw %%mm0, %%mm6          \n\t"
3891             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3892             "movq %%mm7, %%mm0           \n\t"
3893             // use mm7 mask to merge pa & pb
3894             "pand %%mm7, %%mm5           \n\t"
3895             // use mm0 mask copy to merge a & b
3896             "pand %%mm0, %%mm2           \n\t"
3897             "pandn %%mm4, %%mm7          \n\t"
3898             "pandn %%mm1, %%mm0          \n\t"
3899             "paddw %%mm5, %%mm7          \n\t"
3900             "paddw %%mm2, %%mm0          \n\t"
3901             //  test  ((pa <= pb)? pa:pb) <= pc
3902             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3903             "pxor %%mm1, %%mm1           \n\t"
3904             "pand %%mm7, %%mm3           \n\t"
3905             "pandn %%mm0, %%mm7          \n\t"
3906             "pxor %%mm1, %%mm1           \n\t"
3907             "paddw %%mm3, %%mm7          \n\t"
3908             "pxor %%mm0, %%mm0           \n\t"
3909             // step ecx to next set of 8 bytes and repeat loop til done
3910             "addl $8, %%ecx              \n\t"
3911             "packuswb %%mm7, %%mm1       \n\t"
3912             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3913             "cmpl _MMXLength, %%ecx      \n\t"
3914             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3915                                 // mm1 will be used as Raw(x-bpp) next loop
3916             "jb paeth_6lp                \n\t"
3918             : "=S" (dummy_value_S),             // output regs (dummy)
3919               "=D" (dummy_value_D)
3921             : "0" (prev_row),  // esi           // input regs
3922               "1" (row)        // edi
3924             : "%ecx"                            // clobber list
3925 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3926             , "%mm0", "%mm1", "%mm2", "%mm3"
3927             , "%mm4", "%mm5", "%mm6", "%mm7"
3928 #endif
3929          );
3930       }
3931       break;  // end 6 bpp
3933       case 4:
3934       {
3935          _ActiveMask.use  = 0x00000000ffffffffLL;
3937          __asm__ __volatile__ (
3938             "movl _dif, %%ecx            \n\t"
3939 // preload  "movl row, %%edi             \n\t"
3940 // preload  "movl prev_row, %%esi        \n\t"
3941             "pxor %%mm0, %%mm0           \n\t"
3942             // prime the pump:  load the first Raw(x-bpp) data set
3943             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3944                                      //  a=Raw(x-bpp) bytes
3945          "paeth_4lp:                     \n\t"
3946             // do first set of 4 bytes
3947             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3948             "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3949             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3950             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3951             // pav = p - a = (a + b - c) - a = b - c
3952             "movq %%mm2, %%mm4           \n\t"
3953             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3954             // pbv = p - b = (a + b - c) - b = a - c
3955             "movq %%mm1, %%mm5           \n\t"
3956             "psubw %%mm3, %%mm4          \n\t"
3957             "pxor %%mm7, %%mm7           \n\t"
3958             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3959             "movq %%mm4, %%mm6           \n\t"
3960             "psubw %%mm3, %%mm5          \n\t"
3961             // pa = abs(p-a) = abs(pav)
3962             // pb = abs(p-b) = abs(pbv)
3963             // pc = abs(p-c) = abs(pcv)
3964             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3965             "paddw %%mm5, %%mm6          \n\t"
3966             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3967             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3968             "psubw %%mm0, %%mm4          \n\t"
3969             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3970             "psubw %%mm0, %%mm4          \n\t"
3971             "psubw %%mm7, %%mm5          \n\t"
3972             "pxor %%mm0, %%mm0           \n\t"
3973             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3974             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
3975             "psubw %%mm7, %%mm5          \n\t"
3976             "psubw %%mm0, %%mm6          \n\t"
3977             //  test pa <= pb
3978             "movq %%mm4, %%mm7           \n\t"
3979             "psubw %%mm0, %%mm6          \n\t"
3980             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3981             "movq %%mm7, %%mm0           \n\t"
3982             // use mm7 mask to merge pa & pb
3983             "pand %%mm7, %%mm5           \n\t"
3984             // use mm0 mask copy to merge a & b
3985             "pand %%mm0, %%mm2           \n\t"
3986             "pandn %%mm4, %%mm7          \n\t"
3987             "pandn %%mm1, %%mm0          \n\t"
3988             "paddw %%mm5, %%mm7          \n\t"
3989             "paddw %%mm2, %%mm0          \n\t"
3990             //  test  ((pa <= pb)? pa:pb) <= pc
3991             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3992             "pxor %%mm1, %%mm1           \n\t"
3993             "pand %%mm7, %%mm3           \n\t"
3994             "pandn %%mm0, %%mm7          \n\t"
3995             "paddw %%mm3, %%mm7          \n\t"
3996             "pxor %%mm0, %%mm0           \n\t"
3997             "packuswb %%mm1, %%mm7       \n\t"
3998             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3999             "pand _ActiveMask, %%mm7     \n\t"
4000             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4001             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4002             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4003             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4004             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
4005             // do second set of 4 bytes
4006             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4007             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4008             // pav = p - a = (a + b - c) - a = b - c
4009             "movq %%mm2, %%mm4           \n\t"
4010             // pbv = p - b = (a + b - c) - b = a - c
4011             "movq %%mm1, %%mm5           \n\t"
4012             "psubw %%mm3, %%mm4          \n\t"
4013             "pxor %%mm7, %%mm7           \n\t"
4014             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4015             "movq %%mm4, %%mm6           \n\t"
4016             "psubw %%mm3, %%mm5          \n\t"
4017             // pa = abs(p-a) = abs(pav)
4018             // pb = abs(p-b) = abs(pbv)
4019             // pc = abs(p-c) = abs(pcv)
4020             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4021             "paddw %%mm5, %%mm6          \n\t"
4022             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4023             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4024             "psubw %%mm0, %%mm4          \n\t"
4025             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4026             "psubw %%mm0, %%mm4          \n\t"
4027             "psubw %%mm7, %%mm5          \n\t"
4028             "pxor %%mm0, %%mm0           \n\t"
4029             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4030             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4031             "psubw %%mm7, %%mm5          \n\t"
4032             "psubw %%mm0, %%mm6          \n\t"
4033             //  test pa <= pb
4034             "movq %%mm4, %%mm7           \n\t"
4035             "psubw %%mm0, %%mm6          \n\t"
4036             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4037             "movq %%mm7, %%mm0           \n\t"
4038             // use mm7 mask to merge pa & pb
4039             "pand %%mm7, %%mm5           \n\t"
4040             // use mm0 mask copy to merge a & b
4041             "pand %%mm0, %%mm2           \n\t"
4042             "pandn %%mm4, %%mm7          \n\t"
4043             "pandn %%mm1, %%mm0          \n\t"
4044             "paddw %%mm5, %%mm7          \n\t"
4045             "paddw %%mm2, %%mm0          \n\t"
4046             //  test  ((pa <= pb)? pa:pb) <= pc
4047             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4048             "pxor %%mm1, %%mm1           \n\t"
4049             "pand %%mm7, %%mm3           \n\t"
4050             "pandn %%mm0, %%mm7          \n\t"
4051             "pxor %%mm1, %%mm1           \n\t"
4052             "paddw %%mm3, %%mm7          \n\t"
4053             "pxor %%mm0, %%mm0           \n\t"
4054             // step ecx to next set of 8 bytes and repeat loop til done
4055             "addl $8, %%ecx              \n\t"
4056             "packuswb %%mm7, %%mm1       \n\t"
4057             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4058             "cmpl _MMXLength, %%ecx      \n\t"
4059             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4060                                 // mm1 will be used as Raw(x-bpp) next loop
4061             "jb paeth_4lp                \n\t"
4063             : "=S" (dummy_value_S),             // output regs (dummy)
4064               "=D" (dummy_value_D)
4066             : "0" (prev_row),  // esi           // input regs
4067               "1" (row)        // edi
4069             : "%ecx"                            // clobber list
4070 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4071             , "%mm0", "%mm1", "%mm2", "%mm3"
4072             , "%mm4", "%mm5", "%mm6", "%mm7"
4073 #endif
4074          );
4075       }
4076       break;  // end 4 bpp
4078       case 8:                          // bpp == 8
4079       {
4080          _ActiveMask.use  = 0x00000000ffffffffLL;
4082          __asm__ __volatile__ (
4083             "movl _dif, %%ecx            \n\t"
4084 // preload  "movl row, %%edi             \n\t"
4085 // preload  "movl prev_row, %%esi        \n\t"
4086             "pxor %%mm0, %%mm0           \n\t"
4087             // prime the pump:  load the first Raw(x-bpp) data set
4088             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4089                                        //  a=Raw(x-bpp) bytes
4090          "paeth_8lp:                     \n\t"
4091             // do first set of 4 bytes
4092             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4093             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4094             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4095             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4096             // pav = p - a = (a + b - c) - a = b - c
4097             "movq %%mm2, %%mm4           \n\t"
4098             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4099             // pbv = p - b = (a + b - c) - b = a - c
4100             "movq %%mm1, %%mm5           \n\t"
4101             "psubw %%mm3, %%mm4          \n\t"
4102             "pxor %%mm7, %%mm7           \n\t"
4103             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4104             "movq %%mm4, %%mm6           \n\t"
4105             "psubw %%mm3, %%mm5          \n\t"
4106             // pa = abs(p-a) = abs(pav)
4107             // pb = abs(p-b) = abs(pbv)
4108             // pc = abs(p-c) = abs(pcv)
4109             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4110             "paddw %%mm5, %%mm6          \n\t"
4111             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4112             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4113             "psubw %%mm0, %%mm4          \n\t"
4114             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4115             "psubw %%mm0, %%mm4          \n\t"
4116             "psubw %%mm7, %%mm5          \n\t"
4117             "pxor %%mm0, %%mm0           \n\t"
4118             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4119             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4120             "psubw %%mm7, %%mm5          \n\t"
4121             "psubw %%mm0, %%mm6          \n\t"
4122             //  test pa <= pb
4123             "movq %%mm4, %%mm7           \n\t"
4124             "psubw %%mm0, %%mm6          \n\t"
4125             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4126             "movq %%mm7, %%mm0           \n\t"
4127             // use mm7 mask to merge pa & pb
4128             "pand %%mm7, %%mm5           \n\t"
4129             // use mm0 mask copy to merge a & b
4130             "pand %%mm0, %%mm2           \n\t"
4131             "pandn %%mm4, %%mm7          \n\t"
4132             "pandn %%mm1, %%mm0          \n\t"
4133             "paddw %%mm5, %%mm7          \n\t"
4134             "paddw %%mm2, %%mm0          \n\t"
4135             //  test  ((pa <= pb)? pa:pb) <= pc
4136             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4137             "pxor %%mm1, %%mm1           \n\t"
4138             "pand %%mm7, %%mm3           \n\t"
4139             "pandn %%mm0, %%mm7          \n\t"
4140             "paddw %%mm3, %%mm7          \n\t"
4141             "pxor %%mm0, %%mm0           \n\t"
4142             "packuswb %%mm1, %%mm7       \n\t"
4143             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4144             "pand _ActiveMask, %%mm7     \n\t"
4145             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4146             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4147             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4148             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4149             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4151             // do second set of 4 bytes
4152             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4153             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4154             // pav = p - a = (a + b - c) - a = b - c
4155             "movq %%mm2, %%mm4           \n\t"
4156             // pbv = p - b = (a + b - c) - b = a - c
4157             "movq %%mm1, %%mm5           \n\t"
4158             "psubw %%mm3, %%mm4          \n\t"
4159             "pxor %%mm7, %%mm7           \n\t"
4160             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4161             "movq %%mm4, %%mm6           \n\t"
4162             "psubw %%mm3, %%mm5          \n\t"
4163             // pa = abs(p-a) = abs(pav)
4164             // pb = abs(p-b) = abs(pbv)
4165             // pc = abs(p-c) = abs(pcv)
4166             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4167             "paddw %%mm5, %%mm6          \n\t"
4168             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4169             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4170             "psubw %%mm0, %%mm4          \n\t"
4171             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4172             "psubw %%mm0, %%mm4          \n\t"
4173             "psubw %%mm7, %%mm5          \n\t"
4174             "pxor %%mm0, %%mm0           \n\t"
4175             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4176             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4177             "psubw %%mm7, %%mm5          \n\t"
4178             "psubw %%mm0, %%mm6          \n\t"
4179             //  test pa <= pb
4180             "movq %%mm4, %%mm7           \n\t"
4181             "psubw %%mm0, %%mm6          \n\t"
4182             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4183             "movq %%mm7, %%mm0           \n\t"
4184             // use mm7 mask to merge pa & pb
4185             "pand %%mm7, %%mm5           \n\t"
4186             // use mm0 mask copy to merge a & b
4187             "pand %%mm0, %%mm2           \n\t"
4188             "pandn %%mm4, %%mm7          \n\t"
4189             "pandn %%mm1, %%mm0          \n\t"
4190             "paddw %%mm5, %%mm7          \n\t"
4191             "paddw %%mm2, %%mm0          \n\t"
4192             //  test  ((pa <= pb)? pa:pb) <= pc
4193             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4194             "pxor %%mm1, %%mm1           \n\t"
4195             "pand %%mm7, %%mm3           \n\t"
4196             "pandn %%mm0, %%mm7          \n\t"
4197             "pxor %%mm1, %%mm1           \n\t"
4198             "paddw %%mm3, %%mm7          \n\t"
4199             "pxor %%mm0, %%mm0           \n\t"
4200             // step ecx to next set of 8 bytes and repeat loop til done
4201             "addl $8, %%ecx              \n\t"
4202             "packuswb %%mm7, %%mm1       \n\t"
4203             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4204             "cmpl _MMXLength, %%ecx      \n\t"
4205             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4206                             // mm1 will be used as Raw(x-bpp) next loop
4207             "jb paeth_8lp                \n\t"
4209             : "=S" (dummy_value_S),             // output regs (dummy)
4210               "=D" (dummy_value_D)
4212             : "0" (prev_row),  // esi           // input regs
4213               "1" (row)        // edi
4215             : "%ecx"                            // clobber list
4216 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4217             , "%mm0", "%mm1", "%mm2", "%mm3"
4218             , "%mm4", "%mm5", "%mm6", "%mm7"
4219 #endif
4220          );
4221       }
4222       break;  // end 8 bpp
4224       case 1:                // bpp = 1
4225       case 2:                // bpp = 2
4226       default:               // bpp > 8
4227       {
4228          __asm__ __volatile__ (
4229 #ifdef __PIC__
4230             "pushl %%ebx                 \n\t" // save Global Offset Table index
4231 #endif
4232             "movl _dif, %%ebx            \n\t"
4233             "cmpl _FullLength, %%ebx     \n\t"
4234             "jnb paeth_dend              \n\t"
4236 // preload  "movl row, %%edi             \n\t"
4237 // preload  "movl prev_row, %%esi        \n\t"
4238             // do Paeth decode for remaining bytes
4239             "movl %%ebx, %%edx           \n\t"
4240 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4241             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4242             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
4244          "paeth_dlp:                     \n\t"
4245             "xorl %%eax, %%eax           \n\t"
4246             // pav = p - a = (a + b - c) - a = b - c
4247             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4248             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4249             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4250             "movl %%eax, _patemp         \n\t" // Save pav for later use
4251             "xorl %%eax, %%eax           \n\t"
4252             // pbv = p - b = (a + b - c) - b = a - c
4253             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4254             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4255             "movl %%eax, %%ecx           \n\t"
4256             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4257             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4258             // pc = abs(pcv)
4259             "testl $0x80000000, %%eax    \n\t"
4260             "jz paeth_dpca               \n\t"
4261             "negl %%eax                  \n\t" // reverse sign of neg values
4263          "paeth_dpca:                    \n\t"
4264             "movl %%eax, _pctemp         \n\t" // save pc for later use
4265             // pb = abs(pbv)
4266             "testl $0x80000000, %%ecx    \n\t"
4267             "jz paeth_dpba               \n\t"
4268             "negl %%ecx                  \n\t" // reverse sign of neg values
4270          "paeth_dpba:                    \n\t"
4271             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4272             // pa = abs(pav)
4273             "movl _patemp, %%eax         \n\t"
4274             "testl $0x80000000, %%eax    \n\t"
4275             "jz paeth_dpaa               \n\t"
4276             "negl %%eax                  \n\t" // reverse sign of neg values
4278          "paeth_dpaa:                    \n\t"
4279             "movl %%eax, _patemp         \n\t" // save pa for later use
4280             // test if pa <= pb
4281             "cmpl %%ecx, %%eax           \n\t"
4282             "jna paeth_dabb              \n\t"
4283             // pa > pb; now test if pb <= pc
4284             "cmpl _pctemp, %%ecx         \n\t"
4285             "jna paeth_dbbc              \n\t"
4286             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4287             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4288             "jmp paeth_dpaeth            \n\t"
4290          "paeth_dbbc:                    \n\t"
4291             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4292             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4293             "jmp paeth_dpaeth            \n\t"
4295          "paeth_dabb:                    \n\t"
4296             // pa <= pb; now test if pa <= pc
4297             "cmpl _pctemp, %%eax         \n\t"
4298             "jna paeth_dabc              \n\t"
4299             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4300             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4301             "jmp paeth_dpaeth            \n\t"
4303          "paeth_dabc:                    \n\t"
4304             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4305             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4307          "paeth_dpaeth:                  \n\t"
4308             "incl %%ebx                  \n\t"
4309             "incl %%edx                  \n\t"
4310             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4311             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4312             "cmpl _FullLength, %%ebx     \n\t"
4313             "jb paeth_dlp                \n\t"
4315          "paeth_dend:                    \n\t"
4316 #ifdef __PIC__
4317             "popl %%ebx                  \n\t" // index to Global Offset Table
4318 #endif
4320             : "=c" (dummy_value_c),            // output regs (dummy)
4321               "=S" (dummy_value_S),
4322               "=D" (dummy_value_D)
4324             : "0" (bpp),       // ecx          // input regs
4325               "1" (prev_row),  // esi
4326               "2" (row)        // edi
4328             : "%eax", "%edx"                   // clobber list
4329 #ifndef __PIC__
4330             , "%ebx"
4331 #endif
4332          );
4333       }
4334       return;                   // No need to go further with this one
4336    } // end switch (bpp)
4338    __asm__ __volatile__ (
4339       // MMX acceleration complete; now do clean-up
4340       // check if any remaining bytes left to decode
4341 #ifdef __PIC__
4342       "pushl %%ebx                 \n\t" // save index to Global Offset Table
4343 #endif
4344       "movl _MMXLength, %%ebx      \n\t"
4345       "cmpl _FullLength, %%ebx     \n\t"
4346       "jnb paeth_end               \n\t"
4347 //pre "movl row, %%edi             \n\t"
4348 //pre "movl prev_row, %%esi        \n\t"
4349       // do Paeth decode for remaining bytes
4350       "movl %%ebx, %%edx           \n\t"
4351 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4352       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4353       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
4355    "paeth_lp2:                     \n\t"
4356       "xorl %%eax, %%eax           \n\t"
4357       // pav = p - a = (a + b - c) - a = b - c
4358       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4359       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4360       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4361       "movl %%eax, _patemp         \n\t" // Save pav for later use
4362       "xorl %%eax, %%eax           \n\t"
4363       // pbv = p - b = (a + b - c) - b = a - c
4364       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4365       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4366       "movl %%eax, %%ecx           \n\t"
4367       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4368       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4369       // pc = abs(pcv)
4370       "testl $0x80000000, %%eax    \n\t"
4371       "jz paeth_pca2               \n\t"
4372       "negl %%eax                  \n\t" // reverse sign of neg values
4374    "paeth_pca2:                    \n\t"
4375       "movl %%eax, _pctemp         \n\t" // save pc for later use
4376       // pb = abs(pbv)
4377       "testl $0x80000000, %%ecx    \n\t"
4378       "jz paeth_pba2               \n\t"
4379       "negl %%ecx                  \n\t" // reverse sign of neg values
4381    "paeth_pba2:                    \n\t"
4382       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4383       // pa = abs(pav)
4384       "movl _patemp, %%eax         \n\t"
4385       "testl $0x80000000, %%eax    \n\t"
4386       "jz paeth_paa2               \n\t"
4387       "negl %%eax                  \n\t" // reverse sign of neg values
4389    "paeth_paa2:                    \n\t"
4390       "movl %%eax, _patemp         \n\t" // save pa for later use
4391       // test if pa <= pb
4392       "cmpl %%ecx, %%eax           \n\t"
4393       "jna paeth_abb2              \n\t"
4394       // pa > pb; now test if pb <= pc
4395       "cmpl _pctemp, %%ecx         \n\t"
4396       "jna paeth_bbc2              \n\t"
4397       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4398       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4399       "jmp paeth_paeth2            \n\t"
4401    "paeth_bbc2:                    \n\t"
4402       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4403       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4404       "jmp paeth_paeth2            \n\t"
4406    "paeth_abb2:                    \n\t"
4407       // pa <= pb; now test if pa <= pc
4408       "cmpl _pctemp, %%eax         \n\t"
4409       "jna paeth_abc2              \n\t"
4410       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4411       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4412       "jmp paeth_paeth2            \n\t"
4414    "paeth_abc2:                    \n\t"
4415       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4416       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4418    "paeth_paeth2:                  \n\t"
4419       "incl %%ebx                  \n\t"
4420       "incl %%edx                  \n\t"
4421       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4422       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4423       "cmpl _FullLength, %%ebx     \n\t"
4424       "jb paeth_lp2                \n\t"
4426    "paeth_end:                     \n\t"
4427       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
4428 #ifdef __PIC__
4429       "popl %%ebx                  \n\t" // restore index to Global Offset Table
4430 #endif
4432       : "=c" (dummy_value_c),            // output regs (dummy)
4433         "=S" (dummy_value_S),
4434         "=D" (dummy_value_D)
4436       : "0" (bpp),       // ecx          // input regs
4437         "1" (prev_row),  // esi
4438         "2" (row)        // edi
4440       : "%eax", "%edx"                   // clobber list (no input regs!)
4441 #ifndef __PIC__
4442       , "%ebx"
4443 #endif
4444    );
4446 } /* end png_read_filter_row_mmx_paeth() */
4447 #endif
4452 #ifdef PNG_THREAD_UNSAFE_OK
4453 //===========================================================================//
4454 //                                                                           //
4455 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
4456 //                                                                           //
4457 //===========================================================================//
4459 // Optimized code for PNG Sub filter decoder
4461 static void /* PRIVATE */
4462 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4464    int bpp;
4465    int dummy_value_a;
4466    int dummy_value_D;
4468    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
4469    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
4471    __asm__ __volatile__ (
4472 //pre "movl row, %%edi             \n\t"
4473       "movl %%edi, %%esi           \n\t" // lp = row
4474 //pre "movl bpp, %%eax             \n\t"
4475       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4476 //irr "xorl %%eax, %%eax           \n\t"
4477       // get # of bytes to alignment
4478       "movl %%edi, _dif            \n\t" // take start of row
4479       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4480                                          //  alignment boundary
4481       "xorl %%ecx, %%ecx           \n\t"
4482       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4483       "subl %%edi, _dif            \n\t" // subtract from start ==> value
4484       "jz sub_go                   \n\t" //  ecx at alignment
4486    "sub_lp1:                       \n\t" // fix alignment
4487       "movb (%%esi,%%ecx,), %%al   \n\t"
4488       "addb %%al, (%%edi,%%ecx,)   \n\t"
4489       "incl %%ecx                  \n\t"
4490       "cmpl _dif, %%ecx            \n\t"
4491       "jb sub_lp1                  \n\t"
4493    "sub_go:                        \n\t"
4494       "movl _FullLength, %%eax     \n\t"
4495       "movl %%eax, %%edx           \n\t"
4496       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4497       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4498       "subl %%edx, %%eax           \n\t" // drop over bytes from length
4499       "movl %%eax, _MMXLength      \n\t"
4501       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4502         "=D" (dummy_value_D)    // 1
4504       : "0" (bpp),              // eax    // input regs
4505         "1" (row)               // edi
4507       : "%ebx", "%ecx", "%edx"            // clobber list
4508       , "%esi"
4510 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4511       , "%mm0", "%mm1", "%mm2", "%mm3"
4512       , "%mm4", "%mm5", "%mm6", "%mm7"
4513 #endif
4514    );
4516    // now do the math for the rest of the row
4517    switch (bpp)
4518    {
4519       case 3:
4520       {
4521          _ActiveMask.use  = 0x0000ffffff000000LL;
4522          _ShiftBpp.use = 24;       // == 3 * 8
4523          _ShiftRem.use  = 40;      // == 64 - 24
4525          __asm__ __volatile__ (
4526 // preload  "movl row, %%edi              \n\t"
4527             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
4528                                                 //  active byte group
4529             "movl %%edi, %%esi            \n\t" // lp = row
4530 // preload  "movl bpp, %%eax              \n\t"
4531             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4532             "movq %%mm7, %%mm6            \n\t"
4533             "movl _dif, %%edx             \n\t"
4534             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4535                                                 //  3rd active byte group
4536             // prime the pump:  load the first Raw(x-bpp) data set
4537             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4539          "sub_3lp:                        \n\t" // shift data for adding first
4540             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4541                                                 //  shift clears inactive bytes)
4542             // add 1st active group
4543             "movq (%%edi,%%edx,), %%mm0   \n\t"
4544             "paddb %%mm1, %%mm0           \n\t"
4546             // add 2nd active group
4547             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4548             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4549             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4550             "paddb %%mm1, %%mm0           \n\t"
4552             // add 3rd active group
4553             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4554             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4555             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4556             "addl $8, %%edx               \n\t"
4557             "paddb %%mm1, %%mm0           \n\t"
4559             "cmpl _MMXLength, %%edx       \n\t"
4560             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4561             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4562             "jb sub_3lp                   \n\t"
4564             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4565               "=D" (dummy_value_D)    // 1
4567             : "0" (bpp),              // eax    // input regs
4568               "1" (row)               // edi
4570             : "%edx", "%esi"                    // clobber list
4571 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4572             , "%mm0", "%mm1", "%mm6", "%mm7"
4573 #endif
4574          );
4575       }
4576       break;
4578       case 1:
4579       {
4580          __asm__ __volatile__ (
4581             "movl _dif, %%edx            \n\t"
4582 // preload  "movl row, %%edi             \n\t"
4583             "cmpl _FullLength, %%edx     \n\t"
4584             "jnb sub_1end                \n\t"
4585             "movl %%edi, %%esi           \n\t" // lp = row
4586             "xorl %%eax, %%eax           \n\t"
4587 // preload  "movl bpp, %%eax             \n\t"
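4588             "addl %%eax, %%edi           \n\t" // intended rp = row + bpp, but the xorl above zeroed eax (preloaded bpp), so rp == lp -- NOTE(review): likely bug, confirm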
4590          "sub_1lp:                       \n\t"
4591             "movb (%%esi,%%edx,), %%al   \n\t"
4592             "addb %%al, (%%edi,%%edx,)   \n\t"
4593             "incl %%edx                  \n\t"
4594             "cmpl _FullLength, %%edx     \n\t"
4595             "jb sub_1lp                  \n\t"
4597          "sub_1end:                      \n\t"
4599             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4600               "=D" (dummy_value_D)    // 1
4602             : "0" (bpp),              // eax    // input regs
4603               "1" (row)               // edi
4605             : "%edx", "%esi"                    // clobber list
4606          );
4607       }
4608       return;
4610       case 6:
4611       case 4:
4612       //case 7:   // GRR BOGUS
4613       //case 5:   // GRR BOGUS
4614       {
4615          _ShiftBpp.use = bpp << 3;
4616          _ShiftRem.use = 64 - _ShiftBpp.use;
4618          __asm__ __volatile__ (
4619 // preload  "movl row, %%edi              \n\t"
4620             "movl _dif, %%edx             \n\t"
4621             "movl %%edi, %%esi            \n\t" // lp = row
4622 // preload  "movl bpp, %%eax              \n\t"
4623             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4625             // prime the pump:  load the first Raw(x-bpp) data set
4626             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4628          "sub_4lp:                        \n\t" // shift data for adding first
4629             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4630                                                 //  shift clears inactive bytes)
4631             "movq (%%edi,%%edx,), %%mm0   \n\t"
4632             "paddb %%mm1, %%mm0           \n\t"
4634             // add 2nd active group
4635             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4636             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4637             "addl $8, %%edx               \n\t"
4638             "paddb %%mm1, %%mm0           \n\t"
4640             "cmpl _MMXLength, %%edx       \n\t"
4641             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4642             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4643             "jb sub_4lp                   \n\t"
4645             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4646               "=D" (dummy_value_D)    // 1
4648             : "0" (bpp),              // eax    // input regs
4649               "1" (row)               // edi
4651             : "%edx", "%esi"                    // clobber list
4652 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4653             , "%mm0", "%mm1"
4654 #endif
4655          );
4656       }
4657       break;
4659       case 2:
4660       {
4661          _ActiveMask.use = 0x00000000ffff0000LL;
4662          _ShiftBpp.use = 16;       // == 2 * 8
4663          _ShiftRem.use = 48;       // == 64 - 16
4665          __asm__ __volatile__ (
4666             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4667                                                 //  active byte group
4668             "movl _dif, %%edx             \n\t"
4669             "movq %%mm7, %%mm6            \n\t"
4670 // preload  "movl row, %%edi              \n\t"
4671             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4672                                                 //  3rd active byte group
4673             "movl %%edi, %%esi            \n\t" // lp = row
4674             "movq %%mm6, %%mm5            \n\t"
4675 // preload  "movl bpp, %%eax              \n\t"
4676             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4677             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4678                                                 //  4th active byte group
4679             // prime the pump:  load the first Raw(x-bpp) data set
4680             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4682          "sub_2lp:                        \n\t" // shift data for adding first
4683             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4684                                                 //  shift clears inactive bytes)
4685             // add 1st active group
4686             "movq (%%edi,%%edx,), %%mm0   \n\t"
4687             "paddb %%mm1, %%mm0           \n\t"
4689             // add 2nd active group
4690             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4691             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4692             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4693             "paddb %%mm1, %%mm0           \n\t"
4695             // add 3rd active group
4696             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4697             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4698             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4699             "paddb %%mm1, %%mm0           \n\t"
4701             // add 4th active group
4702             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4703             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4704             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4705             "addl $8, %%edx               \n\t"
4706             "paddb %%mm1, %%mm0           \n\t"
4707             "cmpl _MMXLength, %%edx       \n\t"
4708             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4709             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4710             "jb sub_2lp                   \n\t"
4712             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4713               "=D" (dummy_value_D)    // 1
4715             : "0" (bpp),              // eax    // input regs
4716               "1" (row)               // edi
4718             : "%edx", "%esi"                    // clobber list
4719 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4720             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4721 #endif
4722          );
4723       }
4724       break;
4726       case 8:
4727       {
4728          __asm__ __volatile__ (
4729 // preload  "movl row, %%edi              \n\t"
4730             "movl _dif, %%edx             \n\t"
4731             "movl %%edi, %%esi            \n\t" // lp = row
4732 // preload  "movl bpp, %%eax              \n\t"
4733             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4734             "movl _MMXLength, %%ecx       \n\t"
4736             // prime the pump:  load the first Raw(x-bpp) data set
4737             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4738             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
4740          "sub_8lp:                        \n\t"
4741             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4742             "paddb %%mm7, %%mm0           \n\t"
4743             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4744             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4746             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4747             // This will be repeated for each group of 8 bytes with the 8th
4748             // group being used as the Raw(x-bpp) for the 1st group of the
4749             // next loop.
4751             "paddb %%mm0, %%mm1           \n\t"
4752             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4753             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4754             "paddb %%mm1, %%mm2           \n\t"
4755             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4756             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4757             "paddb %%mm2, %%mm3           \n\t"
4758             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4759             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4760             "paddb %%mm3, %%mm4           \n\t"
4761             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4762             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4763             "paddb %%mm4, %%mm5           \n\t"
4764             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4765             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4766             "paddb %%mm5, %%mm6           \n\t"
4767             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4768             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4769             "addl $64, %%edx              \n\t"
4770             "paddb %%mm6, %%mm7           \n\t"
4771             "cmpl %%ecx, %%edx            \n\t"
4772             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4773             "jb sub_8lp                   \n\t"
4775             "cmpl _MMXLength, %%edx       \n\t"
4776             "jnb sub_8lt8                 \n\t"
4778          "sub_8lpA:                       \n\t"
4779             "movq (%%edi,%%edx,), %%mm0   \n\t"
4780             "addl $8, %%edx               \n\t"
4781             "paddb %%mm7, %%mm0           \n\t"
4782             "cmpl _MMXLength, %%edx       \n\t"
4783             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4784             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4785                                                 //  to mm1 to be new Raw(x-bpp)
4786                                                 //  for next loop
4787             "jb sub_8lpA                  \n\t"
4789          "sub_8lt8:                       \n\t"
4791             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4792               "=D" (dummy_value_D)    // 1
4794             : "0" (bpp),              // eax    // input regs
4795               "1" (row)               // edi
4797             : "%ecx", "%edx", "%esi"            // clobber list
4798 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4799             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4800 #endif
4801          );
4802       }
4803       break;
4805       default:                // bpp greater than 8 bytes       GRR BOGUS
4806       {
4807          __asm__ __volatile__ (
4808             "movl _dif, %%edx             \n\t"
4809 // preload  "movl row, %%edi              \n\t"
4810             "movl %%edi, %%esi            \n\t" // lp = row
4811 // preload  "movl bpp, %%eax              \n\t"
4812             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4814          "sub_Alp:                        \n\t"
4815             "movq (%%edi,%%edx,), %%mm0   \n\t"
4816             "movq (%%esi,%%edx,), %%mm1   \n\t"
4817             "addl $8, %%edx               \n\t"
4818             "paddb %%mm1, %%mm0           \n\t"
4819             "cmpl _MMXLength, %%edx       \n\t"
4820             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4821                                                 //  -8 to offset addl edx
4822             "jb sub_Alp                   \n\t"
4824             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4825               "=D" (dummy_value_D)    // 1
4827             : "0" (bpp),              // eax    // input regs
4828               "1" (row)               // edi
4830             : "%edx", "%esi"                    // clobber list
4831 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832             , "%mm0", "%mm1"
4833 #endif
4834          );
4835       }
4836       break;
4838    } // end switch (bpp)
4840    __asm__ __volatile__ (
4841       "movl _MMXLength, %%edx       \n\t"
4842 //pre "movl row, %%edi              \n\t"
4843       "cmpl _FullLength, %%edx      \n\t"
4844       "jnb sub_end                  \n\t"
4846       "movl %%edi, %%esi            \n\t" // lp = row
4847 //pre "movl bpp, %%eax              \n\t"
4848       "addl %%eax, %%edi            \n\t" // rp = row + bpp
4849       "xorl %%eax, %%eax            \n\t"
4851    "sub_lp2:                        \n\t"
4852       "movb (%%esi,%%edx,), %%al    \n\t"
4853       "addb %%al, (%%edi,%%edx,)    \n\t"
4854       "incl %%edx                   \n\t"
4855       "cmpl _FullLength, %%edx      \n\t"
4856       "jb sub_lp2                   \n\t"
4858    "sub_end:                        \n\t"
4859       "EMMS                         \n\t" // end MMX instructions
4861       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4862         "=D" (dummy_value_D)    // 1
4864       : "0" (bpp),              // eax    // input regs
4865         "1" (row)               // edi
4867       : "%edx", "%esi"                    // clobber list
4868    );
4870 } // end of png_read_filter_row_mmx_sub()
4871 #endif
4876 //===========================================================================//
4877 //                                                                           //
4878 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
4879 //                                                                           //
4880 //===========================================================================//
4882 // Optimized code for PNG Up filter decoder
4884 static void /* PRIVATE */
4885 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4886                            png_bytep prev_row)
4888    png_uint_32 len;
4889    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
4890    int dummy_value_S;
4891    int dummy_value_D;
4893    len = row_info->rowbytes;              // number of bytes to filter
4895    __asm__ __volatile__ (
4896 //pre "movl row, %%edi              \n\t"
4897       // get # of bytes to alignment
4898       "movl %%edi, %%ecx            \n\t"
4899       "xorl %%ebx, %%ebx            \n\t"
4900       "addl $0x7, %%ecx             \n\t"
4901       "xorl %%eax, %%eax            \n\t"
4902       "andl $0xfffffff8, %%ecx      \n\t"
4903 //pre "movl prev_row, %%esi         \n\t"
4904       "subl %%edi, %%ecx            \n\t"
4905       "jz up_go                     \n\t"
4907    "up_lp1:                         \n\t" // fix alignment
4908       "movb (%%edi,%%ebx,), %%al    \n\t"
4909       "addb (%%esi,%%ebx,), %%al    \n\t"
4910       "incl %%ebx                   \n\t"
4911       "cmpl %%ecx, %%ebx            \n\t"
4912       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4913       "jb up_lp1                    \n\t" //  offset incl ebx
4915    "up_go:                          \n\t"
4916 //pre "movl len, %%edx              \n\t"
4917       "movl %%edx, %%ecx            \n\t"
4918       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
4919       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
4920       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4922       // unrolled loop - use all MMX registers and interleave to reduce
4923       // number of branch instructions (loops) and reduce partial stalls
4924    "up_loop:                        \n\t"
4925       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4926       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4927       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4928       "paddb %%mm1, %%mm0           \n\t"
4929       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4930       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4931       "paddb %%mm3, %%mm2           \n\t"
4932       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4933       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4934       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4935       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4936       "paddb %%mm5, %%mm4           \n\t"
4937       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4938       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4939       "paddb %%mm7, %%mm6           \n\t"
4940       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4941       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4942       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4943       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4944       "paddb %%mm1, %%mm0           \n\t"
4945       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4946       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4947       "paddb %%mm3, %%mm2           \n\t"
4948       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4949       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4950       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4951       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4952       "paddb %%mm5, %%mm4           \n\t"
4953       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4954       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4955       "addl $64, %%ebx              \n\t"
4956       "paddb %%mm7, %%mm6           \n\t"
4957       "cmpl %%ecx, %%ebx            \n\t"
4958       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4959       "jb up_loop                   \n\t" //  -8 to offset addl ebx
4961       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
4962       "jz up_end                    \n\t"
4964       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
4965       "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
4967       "addl %%edx, %%ecx            \n\t"
4968       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
4969       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4970       "jz up_lt8                    \n\t"
4972    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
4973       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4974       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4975       "addl $8, %%ebx               \n\t"
4976       "paddb %%mm1, %%mm0           \n\t"
4977       "cmpl %%ecx, %%ebx            \n\t"
4978       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4979       "jb up_lpA                    \n\t" //  offset add ebx
4980       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
4981       "jz up_end                    \n\t"
4983    "up_lt8:                         \n\t"
4984       "xorl %%eax, %%eax            \n\t"
4985       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
4987    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
4988       "movb (%%edi,%%ebx,), %%al    \n\t"
4989       "addb (%%esi,%%ebx,), %%al    \n\t"
4990       "incl %%ebx                   \n\t"
4991       "cmpl %%ecx, %%ebx            \n\t"
4992       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4993       "jb up_lp2                    \n\t" //  offset inc ebx
4995    "up_end:                         \n\t"
4996       "EMMS                         \n\t" // conversion of filtered row complete
4998       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
4999         "=S" (dummy_value_S),   // 1
5000         "=D" (dummy_value_D)    // 2
5002       : "0" (len),              // edx    // input regs
5003         "1" (prev_row),         // esi
5004         "2" (row)               // edi
5006       : "%eax", "%ebx", "%ecx"            // clobber list (no input regs!)
5008 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5009       , "%mm0", "%mm1", "%mm2", "%mm3"
5010       , "%mm4", "%mm5", "%mm6", "%mm7"
5011 #endif
5012    );
5014 } // end of png_read_filter_row_mmx_up()
5016 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5021 /*===========================================================================*/
5022 /*                                                                           */
5023 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5024 /*                                                                           */
5025 /*===========================================================================*/
5028 /* Optimized png_read_filter_row routines */
5030 void /* PRIVATE */
5031 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5032    row, png_bytep prev_row, int filter)
5034 #ifdef PNG_DEBUG
5035    char filnm[10];
5036 #endif
5038 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5039 /* GRR:  these are superseded by png_ptr->asm_flags: */
5040 #define UseMMX_sub    1   // GRR:  converted 20000730
5041 #define UseMMX_up     1   // GRR:  converted 20000729
5042 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
5043 #define UseMMX_paeth  1   // GRR:  converted 20000828
5045    if (_mmx_supported == 2) {
5046        /* this should have happened in png_init_mmx_flags() already */
5047        png_warning(png_ptr, "asm_flags may not have been initialized");
5048        png_mmx_support();
5049    }
5050 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5052 #ifdef PNG_DEBUG
5053    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5054    switch (filter)
5055    {
5056       case 0: sprintf(filnm, "none");
5057          break;
5058       case 1: sprintf(filnm, "sub-%s",
5059 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5060         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : 
5061 #endif
5062 "x86");
5063          break;
5064       case 2: sprintf(filnm, "up-%s",
5065 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5066         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5067 #endif
5068  "x86");
5069          break;
5070       case 3: sprintf(filnm, "avg-%s",
5071 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5072         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5073 #endif
5074  "x86");
5075          break;
5076       case 4: sprintf(filnm, "Paeth-%s",
5077 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5078         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5079 #endif
5080 "x86");
5081          break;
5082       default: sprintf(filnm, "unknw");
5083          break;
5084    }
5085    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5086    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5087    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5088       (int)((row_info->pixel_depth + 7) >> 3));
5089    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5090 #endif /* PNG_DEBUG */
5092    switch (filter)
5093    {
5094       case PNG_FILTER_VALUE_NONE:
5095          break;
5097       case PNG_FILTER_VALUE_SUB:
5098 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5099          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5100              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5101              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5102          {
5103             png_read_filter_row_mmx_sub(row_info, row);
5104          }
5105          else
5106 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5107          {
5108             png_uint_32 i;
5109             png_uint_32 istop = row_info->rowbytes;
5110             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5111             png_bytep rp = row + bpp;
5112             png_bytep lp = row;
5114             for (i = bpp; i < istop; i++)
5115             {
5116                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5117                rp++;
5118             }
5119          }  /* end !UseMMX_sub */
5120          break;
5122       case PNG_FILTER_VALUE_UP:
5123 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5124          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5125              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5126              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5127          {
5128             png_read_filter_row_mmx_up(row_info, row, prev_row);
5129          }
5130           else
5131 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5132          {
5133             png_uint_32 i;
5134             png_uint_32 istop = row_info->rowbytes;
5135             png_bytep rp = row;
5136             png_bytep pp = prev_row;
5138             for (i = 0; i < istop; ++i)
5139             {
5140                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5141                rp++;
5142             }
5143          }  /* end !UseMMX_up */
5144          break;
5146       case PNG_FILTER_VALUE_AVG:
5147 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5148          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5149              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5150              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5151          {
5152             png_read_filter_row_mmx_avg(row_info, row, prev_row);
5153          }
5154          else
5155 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5156          {
5157             png_uint_32 i;
5158             png_bytep rp = row;
5159             png_bytep pp = prev_row;
5160             png_bytep lp = row;
5161             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5162             png_uint_32 istop = row_info->rowbytes - bpp;
5164             for (i = 0; i < bpp; i++)
5165             {
5166                *rp = (png_byte)(((int)(*rp) +
5167                   ((int)(*pp++) >> 1)) & 0xff);
5168                rp++;
5169             }
5171             for (i = 0; i < istop; i++)
5172             {
5173                *rp = (png_byte)(((int)(*rp) +
5174                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5175                rp++;
5176             }
5177          }  /* end !UseMMX_avg */
5178          break;
5180       case PNG_FILTER_VALUE_PAETH:
5181 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5182          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5183              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5184              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5185          {
5186             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5187          }
5188          else
5189 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5190          {
5191             png_uint_32 i;
5192             png_bytep rp = row;
5193             png_bytep pp = prev_row;
5194             png_bytep lp = row;
5195             png_bytep cp = prev_row;
5196             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5197             png_uint_32 istop = row_info->rowbytes - bpp;
5199             for (i = 0; i < bpp; i++)
5200             {
5201                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5202                rp++;
5203             }
5205             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5206             {
5207                int a, b, c, pa, pb, pc, p;
5209                a = *lp++;
5210                b = *pp++;
5211                c = *cp++;
5213                p = b - c;
5214                pc = a - c;
5216 #ifdef PNG_USE_ABS
5217                pa = abs(p);
5218                pb = abs(pc);
5219                pc = abs(p + pc);
5220 #else
5221                pa = p < 0 ? -p : p;
5222                pb = pc < 0 ? -pc : pc;
5223                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5224 #endif
5226                /*
5227                   if (pa <= pb && pa <= pc)
5228                      p = a;
5229                   else if (pb <= pc)
5230                      p = b;
5231                   else
5232                      p = c;
5233                 */
5235                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5237                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5238                rp++;
5239             }
5240          }  /* end !UseMMX_paeth */
5241          break;
5243       default:
5244          png_warning(png_ptr, "Ignoring bad row-filter type");
5245          *row=0;
5246          break;
5247    }
5250 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5253 /*===========================================================================*/
5254 /*                                                                           */
5255 /*                      P N G _ M M X _ S U P P O R T                        */
5256 /*                                                                           */
5257 /*===========================================================================*/
5259 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
5260  *             (2) all instructions compile with gcc 2.7.2.3 and later
5261  *             (3) the function is moved down here to prevent gcc from
5262  *                  inlining it in multiple places and then barfing be-
5263  *                  cause the ".NOT_SUPPORTED" label is multiply defined
5264  *             [is there a way to signal that a *single* function should
5265  *              not be inlined?  is there a way to modify the label for
5266  *              each inlined instance, e.g., by appending _1, _2, etc.?
5267  *              maybe if don't use leading "." in label name? (nope...sigh)]
5268  */
5270 int PNGAPI
5271 png_mmx_support(void)
5273 #if defined(PNG_MMX_CODE_SUPPORTED)
5274     __asm__ __volatile__ (
5275         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
5276         "pushl %%ecx          \n\t"  // so does ecx...
5277         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
5278 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
5279 //      "pushf                \n\t"  // 16-bit pushf
5280         "pushfl               \n\t"  // save Eflag to stack
5281         "popl %%eax           \n\t"  // get Eflag from stack into eax
5282         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
5283         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5284         "pushl %%eax          \n\t"  // save modified Eflag back to stack
5285 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
5286 //      "popf                 \n\t"  // 16-bit popf
5287         "popfl                \n\t"  // restore modified value to Eflag reg
5288         "pushfl               \n\t"  // save Eflag to stack
5289         "popl %%eax           \n\t"  // get Eflag from stack
5290         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
5291         "jz .NOT_SUPPORTED    \n\t"  // if same, CPUID instr. is not supported
5293         "xorl %%eax, %%eax    \n\t"  // set eax to zero
5294 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
5295         "cpuid                \n\t"  // get the CPU identification info
5296         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
5297         "jl .NOT_SUPPORTED    \n\t"  // if eax is zero, MMX is not supported
5299         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5300         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5301                                      // faster than the instruction "mov eax, 1"
5302         "cpuid                \n\t"  // get the CPU identification info again
5303         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5304         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5305         "jz .NOT_SUPPORTED    \n\t"  // non-zero = yes, MMX IS supported
5307         "movl $1, %%eax       \n\t"  // set return value to 1
5308         "jmp  .RETURN         \n\t"  // DONE:  have MMX support
5310     ".NOT_SUPPORTED:          \n\t"  // target label for jump instructions
5311         "movl $0, %%eax       \n\t"  // set return value to 0
5312     ".RETURN:          \n\t"  // target label for jump instructions
5313         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5314         "popl %%edx           \n\t"  // restore edx
5315         "popl %%ecx           \n\t"  // restore ecx
5316         "popl %%ebx           \n\t"  // restore ebx
5318 //      "ret                  \n\t"  // DONE:  no MMX support
5319                                      // (fall through to standard C "ret")
5321         :                            // output list (none)
5323         :                            // any variables used on input (none)
5325         : "%eax"                     // clobber list
5326 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5327 //      , "memory"   // if write to a variable gcc thought was in a reg
5328 //      , "cc"       // "condition codes" (flag bits)
5329     );
5330 #else     
5331     _mmx_supported = 0;
5332 #endif /* PNG_MMX_CODE_SUPPORTED */
5334     return _mmx_supported;
5338 #endif /* PNG_USE_PNGGCCRD */