1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
9 * libpng 1.0.9 - January 31, 2001
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
26 * For djgpp, see
27 *
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
32 *
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 * binutils.tgz
37 *
38 * For other platforms, see the main GNU site:
39 *
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 * Version 2.5.2l.15 is definitely too old...
43 */
45 /*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 * - fixed mmxsupport()/png_do_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently the register spillage is caused by ebx, since
110 * -fPIC code uses it to index the global offset table. Removing ebx from
111 * the input-reg lists in png_combine_row() eliminated the compiler errors,
112 * so that is now ifdef'd on the __PIC__ macro: if defined, use a global for unmask
113 *
114 * 19991107:
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
117 *
118 * 19991120:
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
127 *
128 * 20000319:
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
152 *
153 * 20000729:
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
160 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
173 *
174 * 20000823:
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
189 *
190 * 20000829:
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
195 *
196 * 20000914:
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010103:
207 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
208 * and made it public
209 *
210 * 20010104:
211 * - removed dependency on png_read_filter_row_c() (C code already duplicated
212 * within MMX version of png_read_filter_row()) so no longer necessary to
213 * compile it into pngrutil.o
214 *
215 * STILL TO DO:
216 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
217 * - write MMX code for 48-bit case (pixel_bytes == 6)
218 * - figure out what's up with 24-bit case (pixel_bytes == 3):
219 * why subtract 8 from width_mmx in the pass 4/5 case?
220 * (only width_mmx case) (near line 1606)
221 * - rewrite all MMX interlacing code so it's aligned with beginning
222 * of the row buffer, not the end (see 19991007 for details)
223 * x pick one version of mmxsupport() and get rid of the other
224 * - add error messages to any remaining bogus default cases
225 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
226 * - add support for runtime enable/disable/query of various MMX routines
227 */
229 //#define PNG_DEBUG 2 // GRR
231 #define PNG_INTERNAL
232 #include "png.h"
234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
236 #ifdef PNG_USE_LOCAL_ARRAYS
// Interlace-pass tables, one entry per value of png_ptr->pass (7 entries each).
// The non-MMX branches of png_combine_row() index them with png_ptr->pass and,
// for pixels wider than one byte, multiply each value by the pixel size in
// bytes.  The comments in those branches indicate that png.c holds tables
// with the same names and values.
237 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0}; // offset of the first pixel copied in each pass
238 static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1}; // step, in pixels, from one copy to the next
239 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1}; // number of pixels copied at each step
240 #endif
242 // djgpp, Win32, and Cygwin add their own underscores to global variables,
243 // so define them without:
// The inline assembly in this file always names these globals with a leading
// underscore (e.g. "movd _unmask, %%mm7").  On the platforms tested below,
// the macros rename the C identifiers to the unprefixed form, so that the
// underscore added by the toolchain yields the name the assembly expects.
// On every other platform no renaming takes place and the identifiers keep
// the underscore exactly as written.
// NOTE(review): several names renamed here (_LBCarryMask, _HBClearMask,
// _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem) are not
// among the globals defined immediately below; presumably they are defined
// later in the file -- confirm.
244 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
245 # define _mmx_supported mmx_supported
246 # define _unmask unmask
247 # define _const4 const4
248 # define _const6 const6
249 # define _mask8_0 mask8_0
250 # define _mask16_1 mask16_1
251 # define _mask16_0 mask16_0
252 # define _mask24_2 mask24_2
253 # define _mask24_1 mask24_1
254 # define _mask24_0 mask24_0
255 # define _mask32_3 mask32_3
256 # define _mask32_2 mask32_2
257 # define _mask32_1 mask32_1
258 # define _mask32_0 mask32_0
259 # define _mask48_5 mask48_5
260 # define _mask48_4 mask48_4
261 # define _mask48_3 mask48_3
262 # define _mask48_2 mask48_2
263 # define _mask48_1 mask48_1
264 # define _mask48_0 mask48_0
265 # define _FullLength FullLength
266 # define _MMXLength MMXLength
267 # define _dif dif
268 # define _LBCarryMask LBCarryMask
269 # define _HBClearMask HBClearMask
270 # define _ActiveMask ActiveMask
271 # define _ActiveMask2 ActiveMask2
272 # define _ActiveMaskEnd ActiveMaskEnd
273 # define _ShiftBpp ShiftBpp
274 # define _ShiftRem ShiftRem
275 # define _patemp patemp
276 # define _pbtemp pbtemp
277 # define _pctemp pctemp
278 #endif
// Cached result of MMX detection.  The initial value 2 marks "not yet
// determined": png_combine_row() calls png_mmx_support() when it sees 2 and
// afterwards tests this variable as a plain true/false flag.
// NOTE(review): presumably png_mmx_support() overwrites it with 0 or 1 (the
// 20010103 changelog entry mentions "auto-set of mmx_supported") -- confirm.
280 static int _mmx_supported = 2;
282 /* These constants are used in the inlined MMX assembly code.
283 Ignore gcc's "At top level: defined but not used" warnings. */
285 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
286 * since that case uses the %ebx register for indexing the Global Offset Table
287 * and there were no other registers available. But gcc 2.95 and later emit
288 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
289 * in the non-PIC case, so we'll just use the global unconditionally now.
290 */
// png_combine_row() stores ~mask here immediately before each MMX block, and
// the assembly loads it with "movd _unmask, %%mm7".
291 static int _unmask;
// Byte-select patterns for the MMX paths of png_combine_row().  In _maskD_K,
// D is the pixel depth in bits and K is the index of the 8-byte chunk within
// a group of 8 pixels (the assembly applies _maskD_0 at offset 0, _maskD_1 at
// offset 8, and so on).  Each byte holds the interlace-mask bit of the pixel
// that the corresponding memory byte belongs to: 0x80 for the first pixel of
// the group down to 0x01 for the eighth, with the lowest-order byte of the
// constant corresponding to the lowest address.  The assembly ANDs each
// pattern with the replicated low byte of _unmask and compares the result
// with zero (pcmpeqb), which yields 0xFF in every byte whose pixel is
// selected by mask and 0x00 elsewhere.
// NOTE(review): the _mask48_* patterns and _const4/_const6 are not referenced
// in the portion of the file reviewed here; their users are presumably
// further down -- confirm.
293 static unsigned long long _mask8_0 = 0x0102040810204080LL; // 8 bpp: one byte per pixel
295 static unsigned long long _mask16_1 = 0x0101020204040808LL; // 16 bpp: two bytes per pixel
296 static unsigned long long _mask16_0 = 0x1010202040408080LL;
298 static unsigned long long _mask24_2 = 0x0101010202020404LL; // 24 bpp: three bytes per pixel
299 static unsigned long long _mask24_1 = 0x0408080810101020LL;
300 static unsigned long long _mask24_0 = 0x2020404040808080LL;
302 static unsigned long long _mask32_3 = 0x0101010102020202LL; // 32 bpp: four bytes per pixel
303 static unsigned long long _mask32_2 = 0x0404040408080808LL;
304 static unsigned long long _mask32_1 = 0x1010101020202020LL;
305 static unsigned long long _mask32_0 = 0x4040404080808080LL;
307 static unsigned long long _mask48_5 = 0x0101010101010202LL; // 48 bpp: six bytes per pixel
308 static unsigned long long _mask48_4 = 0x0202020204040404LL;
309 static unsigned long long _mask48_3 = 0x0404080808080808LL;
310 static unsigned long long _mask48_2 = 0x1010101010102020LL;
311 static unsigned long long _mask48_1 = 0x2020202040404040LL;
312 static unsigned long long _mask48_0 = 0x4040808080808080LL;
314 static unsigned long long _const4 = 0x0000000000FFFFFFLL; // low three bytes set
315 //static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
316 static unsigned long long _const6 = 0x00000000000000FFLL; // low byte set
318 // These are used in the row-filter routines and should/would be local
319 // variables if not for gcc addressing limitations.
// NOTE(review): the row-filter routines lie outside the portion of the file
// reviewed here, so the exact role of each variable is not confirmed.  The
// 19991120 changelog entry records that "_dif" was made global to simplify
// conversion of the filtering routines.
321 static png_uint_32 _FullLength;
322 static png_uint_32 _MMXLength;
323 static int _dif;
324 static int _patemp; // temp variables for Paeth routine
325 static int _pbtemp;
326 static int _pctemp;
331 //===========================================================================//
332 // //
333 // P N G _ C O M B I N E _ R O W //
334 // //
335 //===========================================================================//
337 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
339 /* Combines the row most recently read (png_ptr->row_buf + 1) into the
340 caller-supplied buffer `row'. This routine takes care of alpha and
341 transparency if requested. It also handles the two methods of progressive
342 display of interlaced images, depending on the mask value.
343 The mask value describes which pixels are to be copied into the row.
344 The pattern always repeats every 8 pixels, so just 8 bits are needed;
345 bit 0x80 governs the leftmost pixel of each group. A one indicates the pixel
346 is to be combined; a zero indicates the pixel is to be skipped. This is in
347 addition to any alpha or transparency value associated with the pixel.
348 If you want all pixels to be combined, pass 0xff (255) in mask. */
350 /* Use this routine for the x86 platform - it uses a faster MMX routine
351 if the machine supports MMX. */
353 void /* PRIVATE */
354 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
355 {
356 png_debug(1,"in png_combine_row_asm\n");
358 if (_mmx_supported == 2) {
359 png_mmx_support();
360 }
362 if (mask == 0xff)
363 {
364 png_memcpy(row, png_ptr->row_buf + 1,
365 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
366 }
367 /* GRR: png_combine_row() never called with mask == 0 */
368 else
369 {
370 switch (png_ptr->row_info.pixel_depth)
371 {
372 case 1: // png_ptr->row_info.pixel_depth
373 {
374 png_bytep sp;
375 png_bytep dp;
376 int s_inc, s_start, s_end;
377 int m;
378 int shift;
379 png_uint_32 i;
381 sp = png_ptr->row_buf + 1;
382 dp = row;
383 m = 0x80;
384 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
385 if (png_ptr->transformations & PNG_PACKSWAP)
386 {
387 s_start = 0;
388 s_end = 7;
389 s_inc = 1;
390 }
391 else
392 #endif
393 {
394 s_start = 7;
395 s_end = 0;
396 s_inc = -1;
397 }
399 shift = s_start;
401 for (i = 0; i < png_ptr->width; i++)
402 {
403 if (m & mask)
404 {
405 int value;
407 value = (*sp >> shift) & 0x1;
408 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
409 *dp |= (png_byte)(value << shift);
410 }
412 if (shift == s_end)
413 {
414 shift = s_start;
415 sp++;
416 dp++;
417 }
418 else
419 shift += s_inc;
421 if (m == 1)
422 m = 0x80;
423 else
424 m >>= 1;
425 }
426 break;
427 }
429 case 2: // png_ptr->row_info.pixel_depth
430 {
431 png_bytep sp;
432 png_bytep dp;
433 int s_start, s_end, s_inc;
434 int m;
435 int shift;
436 png_uint_32 i;
437 int value;
439 sp = png_ptr->row_buf + 1;
440 dp = row;
441 m = 0x80;
442 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
443 if (png_ptr->transformations & PNG_PACKSWAP)
444 {
445 s_start = 0;
446 s_end = 6;
447 s_inc = 2;
448 }
449 else
450 #endif
451 {
452 s_start = 6;
453 s_end = 0;
454 s_inc = -2;
455 }
457 shift = s_start;
459 for (i = 0; i < png_ptr->width; i++)
460 {
461 if (m & mask)
462 {
463 value = (*sp >> shift) & 0x3;
464 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
465 *dp |= (png_byte)(value << shift);
466 }
468 if (shift == s_end)
469 {
470 shift = s_start;
471 sp++;
472 dp++;
473 }
474 else
475 shift += s_inc;
476 if (m == 1)
477 m = 0x80;
478 else
479 m >>= 1;
480 }
481 break;
482 }
484 case 4: // png_ptr->row_info.pixel_depth
485 {
486 png_bytep sp;
487 png_bytep dp;
488 int s_start, s_end, s_inc;
489 int m;
490 int shift;
491 png_uint_32 i;
492 int value;
494 sp = png_ptr->row_buf + 1;
495 dp = row;
496 m = 0x80;
497 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
498 if (png_ptr->transformations & PNG_PACKSWAP)
499 {
500 s_start = 0;
501 s_end = 4;
502 s_inc = 4;
503 }
504 else
505 #endif
506 {
507 s_start = 4;
508 s_end = 0;
509 s_inc = -4;
510 }
511 shift = s_start;
513 for (i = 0; i < png_ptr->width; i++)
514 {
515 if (m & mask)
516 {
517 value = (*sp >> shift) & 0xf;
518 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
519 *dp |= (png_byte)(value << shift);
520 }
522 if (shift == s_end)
523 {
524 shift = s_start;
525 sp++;
526 dp++;
527 }
528 else
529 shift += s_inc;
530 if (m == 1)
531 m = 0x80;
532 else
533 m >>= 1;
534 }
535 break;
536 }
538 case 8: // png_ptr->row_info.pixel_depth
539 {
540 png_bytep srcptr;
541 png_bytep dstptr;
543 if ( _mmx_supported )
544 {
545 png_uint_32 len;
546 int diff;
547 int dummy_value_a; // fix 'forbidden register spilled' error
548 int dummy_value_d;
549 int dummy_value_c;
550 int dummy_value_S;
551 int dummy_value_D;
552 _unmask = ~mask; // global variable for -fPIC version
553 srcptr = png_ptr->row_buf + 1;
554 dstptr = row;
555 len = png_ptr->width &~7; // reduce to multiple of 8
556 diff = png_ptr->width & 7; // amount lost
558 __asm__ __volatile__ (
559 "movd _unmask, %%mm7 \n\t" // load bit pattern
560 "psubb %%mm6, %%mm6 \n\t" // zero mm6
561 "punpcklbw %%mm7, %%mm7 \n\t"
562 "punpcklwd %%mm7, %%mm7 \n\t"
563 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
565 "movq _mask8_0, %%mm0 \n\t"
566 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
567 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
569 // preload "movl len, %%ecx \n\t" // load length of line
570 // preload "movl srcptr, %%esi \n\t" // load source
571 // preload "movl dstptr, %%edi \n\t" // load dest
573 "cmpl $0, %%ecx \n\t" // len == 0 ?
574 "je mainloop8end \n\t"
576 "mainloop8: \n\t"
577 "movq (%%esi), %%mm4 \n\t" // *srcptr
578 "pand %%mm0, %%mm4 \n\t"
579 "movq %%mm0, %%mm6 \n\t"
580 "pandn (%%edi), %%mm6 \n\t" // *dstptr
581 "por %%mm6, %%mm4 \n\t"
582 "movq %%mm4, (%%edi) \n\t"
583 "addl $8, %%esi \n\t" // inc by 8 bytes processed
584 "addl $8, %%edi \n\t"
585 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
586 "ja mainloop8 \n\t"
588 "mainloop8end: \n\t"
589 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
590 "movl %%eax, %%ecx \n\t"
591 "cmpl $0, %%ecx \n\t"
592 "jz end8 \n\t"
593 // preload "movl mask, %%edx \n\t"
594 "sall $24, %%edx \n\t" // make low byte, high byte
596 "secondloop8: \n\t"
597 "sall %%edx \n\t" // move high bit to CF
598 "jnc skip8 \n\t" // if CF = 0
599 "movb (%%esi), %%al \n\t"
600 "movb %%al, (%%edi) \n\t"
602 "skip8: \n\t"
603 "incl %%esi \n\t"
604 "incl %%edi \n\t"
605 "decl %%ecx \n\t"
606 "jnz secondloop8 \n\t"
608 "end8: \n\t"
609 "EMMS \n\t" // DONE
611 : "=a" (dummy_value_a), // output regs (dummy)
612 "=d" (dummy_value_d),
613 "=c" (dummy_value_c),
614 "=S" (dummy_value_S),
615 "=D" (dummy_value_D)
617 : "3" (srcptr), // esi // input regs
618 "4" (dstptr), // edi
619 "0" (diff), // eax
620 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
621 "2" (len), // ecx
622 "1" (mask) // edx
624 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
625 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
626 #endif
627 );
628 }
629 else /* mmx _not supported - Use modified C routine */
630 {
631 register png_uint_32 i;
632 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
633 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
634 register int stride = png_pass_inc[png_ptr->pass];
635 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
636 register int rep_bytes = png_pass_width[png_ptr->pass];
637 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
638 register png_uint_32 final_val = png_ptr->width;
640 srcptr = png_ptr->row_buf + 1 + initial_val;
641 dstptr = row + initial_val;
643 for (i = initial_val; i < final_val; i += stride)
644 {
645 png_memcpy(dstptr, srcptr, rep_bytes);
646 srcptr += stride;
647 dstptr += stride;
648 }
649 } /* end of else */
651 break;
652 } // end 8 bpp
654 case 16: // png_ptr->row_info.pixel_depth
655 {
656 png_bytep srcptr;
657 png_bytep dstptr;
659 if ( _mmx_supported )
660 {
661 png_uint_32 len;
662 int diff;
663 int dummy_value_a; // fix 'forbidden register spilled' error
664 int dummy_value_d;
665 int dummy_value_c;
666 int dummy_value_S;
667 int dummy_value_D;
668 _unmask = ~mask; // global variable for -fPIC version
669 srcptr = png_ptr->row_buf + 1;
670 dstptr = row;
671 len = png_ptr->width &~7; // reduce to multiple of 8
672 diff = png_ptr->width & 7; // amount lost
674 __asm__ __volatile__ (
675 "movd _unmask, %%mm7 \n\t" // load bit pattern
676 "psubb %%mm6, %%mm6 \n\t" // zero mm6
677 "punpcklbw %%mm7, %%mm7 \n\t"
678 "punpcklwd %%mm7, %%mm7 \n\t"
679 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
681 "movq _mask16_0, %%mm0 \n\t"
682 "movq _mask16_1, %%mm1 \n\t"
684 "pand %%mm7, %%mm0 \n\t"
685 "pand %%mm7, %%mm1 \n\t"
687 "pcmpeqb %%mm6, %%mm0 \n\t"
688 "pcmpeqb %%mm6, %%mm1 \n\t"
690 // preload "movl len, %%ecx \n\t" // load length of line
691 // preload "movl srcptr, %%esi \n\t" // load source
692 // preload "movl dstptr, %%edi \n\t" // load dest
694 "cmpl $0, %%ecx \n\t"
695 "jz mainloop16end \n\t"
697 "mainloop16: \n\t"
698 "movq (%%esi), %%mm4 \n\t"
699 "pand %%mm0, %%mm4 \n\t"
700 "movq %%mm0, %%mm6 \n\t"
701 "movq (%%edi), %%mm7 \n\t"
702 "pandn %%mm7, %%mm6 \n\t"
703 "por %%mm6, %%mm4 \n\t"
704 "movq %%mm4, (%%edi) \n\t"
706 "movq 8(%%esi), %%mm5 \n\t"
707 "pand %%mm1, %%mm5 \n\t"
708 "movq %%mm1, %%mm7 \n\t"
709 "movq 8(%%edi), %%mm6 \n\t"
710 "pandn %%mm6, %%mm7 \n\t"
711 "por %%mm7, %%mm5 \n\t"
712 "movq %%mm5, 8(%%edi) \n\t"
714 "addl $16, %%esi \n\t" // inc by 16 bytes processed
715 "addl $16, %%edi \n\t"
716 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
717 "ja mainloop16 \n\t"
719 "mainloop16end: \n\t"
720 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
721 "movl %%eax, %%ecx \n\t"
722 "cmpl $0, %%ecx \n\t"
723 "jz end16 \n\t"
724 // preload "movl mask, %%edx \n\t"
725 "sall $24, %%edx \n\t" // make low byte, high byte
727 "secondloop16: \n\t"
728 "sall %%edx \n\t" // move high bit to CF
729 "jnc skip16 \n\t" // if CF = 0
730 "movw (%%esi), %%ax \n\t"
731 "movw %%ax, (%%edi) \n\t"
733 "skip16: \n\t"
734 "addl $2, %%esi \n\t"
735 "addl $2, %%edi \n\t"
736 "decl %%ecx \n\t"
737 "jnz secondloop16 \n\t"
739 "end16: \n\t"
740 "EMMS \n\t" // DONE
742 : "=a" (dummy_value_a), // output regs (dummy)
743 "=c" (dummy_value_c),
744 "=d" (dummy_value_d),
745 "=S" (dummy_value_S),
746 "=D" (dummy_value_D)
748 : "0" (diff), // eax // input regs
749 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
750 "1" (len), // ecx
751 "2" (mask), // edx
752 "3" (srcptr), // esi
753 "4" (dstptr) // edi
755 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
756 : "%mm0", "%mm1", "%mm4" // clobber list
757 , "%mm5", "%mm6", "%mm7"
758 #endif
759 );
760 }
761 else /* mmx _not supported - Use modified C routine */
762 {
763 register png_uint_32 i;
764 png_uint_32 initial_val = 2 * png_pass_start[png_ptr->pass];
765 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
766 register int stride = 2 * png_pass_inc[png_ptr->pass];
767 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
768 register int rep_bytes = 2 * png_pass_width[png_ptr->pass];
769 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
770 register png_uint_32 final_val = 2 * png_ptr->width;
772 srcptr = png_ptr->row_buf + 1 + initial_val;
773 dstptr = row + initial_val;
775 for (i = initial_val; i < final_val; i += stride)
776 {
777 png_memcpy(dstptr, srcptr, rep_bytes);
778 srcptr += stride;
779 dstptr += stride;
780 }
781 } /* end of else */
783 break;
784 } // end 16 bpp
786 case 24: // png_ptr->row_info.pixel_depth
787 {
788 png_bytep srcptr;
789 png_bytep dstptr;
791 if ( _mmx_supported )
792 {
793 png_uint_32 len;
794 int diff;
795 int dummy_value_a; // fix 'forbidden register spilled' error
796 int dummy_value_d;
797 int dummy_value_c;
798 int dummy_value_S;
799 int dummy_value_D;
800 _unmask = ~mask; // global variable for -fPIC version
801 srcptr = png_ptr->row_buf + 1;
802 dstptr = row;
803 len = png_ptr->width &~7; // reduce to multiple of 8
804 diff = png_ptr->width & 7; // amount lost
806 __asm__ __volatile__ (
807 "movd _unmask, %%mm7 \n\t" // load bit pattern
808 "psubb %%mm6, %%mm6 \n\t" // zero mm6
809 "punpcklbw %%mm7, %%mm7 \n\t"
810 "punpcklwd %%mm7, %%mm7 \n\t"
811 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
813 "movq _mask24_0, %%mm0 \n\t"
814 "movq _mask24_1, %%mm1 \n\t"
815 "movq _mask24_2, %%mm2 \n\t"
817 "pand %%mm7, %%mm0 \n\t"
818 "pand %%mm7, %%mm1 \n\t"
819 "pand %%mm7, %%mm2 \n\t"
821 "pcmpeqb %%mm6, %%mm0 \n\t"
822 "pcmpeqb %%mm6, %%mm1 \n\t"
823 "pcmpeqb %%mm6, %%mm2 \n\t"
825 // preload "movl len, %%ecx \n\t" // load length of line
826 // preload "movl srcptr, %%esi \n\t" // load source
827 // preload "movl dstptr, %%edi \n\t" // load dest
829 "cmpl $0, %%ecx \n\t"
830 "jz mainloop24end \n\t"
832 "mainloop24: \n\t"
833 "movq (%%esi), %%mm4 \n\t"
834 "pand %%mm0, %%mm4 \n\t"
835 "movq %%mm0, %%mm6 \n\t"
836 "movq (%%edi), %%mm7 \n\t"
837 "pandn %%mm7, %%mm6 \n\t"
838 "por %%mm6, %%mm4 \n\t"
839 "movq %%mm4, (%%edi) \n\t"
841 "movq 8(%%esi), %%mm5 \n\t"
842 "pand %%mm1, %%mm5 \n\t"
843 "movq %%mm1, %%mm7 \n\t"
844 "movq 8(%%edi), %%mm6 \n\t"
845 "pandn %%mm6, %%mm7 \n\t"
846 "por %%mm7, %%mm5 \n\t"
847 "movq %%mm5, 8(%%edi) \n\t"
849 "movq 16(%%esi), %%mm6 \n\t"
850 "pand %%mm2, %%mm6 \n\t"
851 "movq %%mm2, %%mm4 \n\t"
852 "movq 16(%%edi), %%mm7 \n\t"
853 "pandn %%mm7, %%mm4 \n\t"
854 "por %%mm4, %%mm6 \n\t"
855 "movq %%mm6, 16(%%edi) \n\t"
857 "addl $24, %%esi \n\t" // inc by 24 bytes processed
858 "addl $24, %%edi \n\t"
859 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
861 "ja mainloop24 \n\t"
863 "mainloop24end: \n\t"
864 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
865 "movl %%eax, %%ecx \n\t"
866 "cmpl $0, %%ecx \n\t"
867 "jz end24 \n\t"
868 // preload "movl mask, %%edx \n\t"
869 "sall $24, %%edx \n\t" // make low byte, high byte
871 "secondloop24: \n\t"
872 "sall %%edx \n\t" // move high bit to CF
873 "jnc skip24 \n\t" // if CF = 0
874 "movw (%%esi), %%ax \n\t"
875 "movw %%ax, (%%edi) \n\t"
876 "xorl %%eax, %%eax \n\t"
877 "movb 2(%%esi), %%al \n\t"
878 "movb %%al, 2(%%edi) \n\t"
880 "skip24: \n\t"
881 "addl $3, %%esi \n\t"
882 "addl $3, %%edi \n\t"
883 "decl %%ecx \n\t"
884 "jnz secondloop24 \n\t"
886 "end24: \n\t"
887 "EMMS \n\t" // DONE
889 : "=a" (dummy_value_a), // output regs (dummy)
890 "=d" (dummy_value_d),
891 "=c" (dummy_value_c),
892 "=S" (dummy_value_S),
893 "=D" (dummy_value_D)
895 : "3" (srcptr), // esi // input regs
896 "4" (dstptr), // edi
897 "0" (diff), // eax
898 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
899 "2" (len), // ecx
900 "1" (mask) // edx
902 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
903 : "%mm0", "%mm1", "%mm2" // clobber list
904 , "%mm4", "%mm5", "%mm6", "%mm7"
905 #endif
906 );
907 }
908 else /* mmx _not supported - Use modified C routine */
909 {
910 register png_uint_32 i;
911 png_uint_32 initial_val = 3 * png_pass_start[png_ptr->pass];
912 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
913 register int stride = 3 * png_pass_inc[png_ptr->pass];
914 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
915 register int rep_bytes = 3 * png_pass_width[png_ptr->pass];
916 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
917 register png_uint_32 final_val = 3 * png_ptr->width;
919 srcptr = png_ptr->row_buf + 1 + initial_val;
920 dstptr = row + initial_val;
922 for (i = initial_val; i < final_val; i += stride)
923 {
924 png_memcpy(dstptr, srcptr, rep_bytes);
925 srcptr += stride;
926 dstptr += stride;
927 }
928 } /* end of else */
930 break;
931 } // end 24 bpp
933 case 32: // png_ptr->row_info.pixel_depth
934 {
935 png_bytep srcptr;
936 png_bytep dstptr;
938 if ( _mmx_supported )
939 {
940 png_uint_32 len;
941 int diff;
942 int dummy_value_a; // fix 'forbidden register spilled' error
943 int dummy_value_d;
944 int dummy_value_c;
945 int dummy_value_S;
946 int dummy_value_D;
947 _unmask = ~mask; // global variable for -fPIC version
948 srcptr = png_ptr->row_buf + 1;
949 dstptr = row;
950 len = png_ptr->width &~7; // reduce to multiple of 8
951 diff = png_ptr->width & 7; // amount lost
953 __asm__ __volatile__ (
954 "movd _unmask, %%mm7 \n\t" // load bit pattern
955 "psubb %%mm6, %%mm6 \n\t" // zero mm6
956 "punpcklbw %%mm7, %%mm7 \n\t"
957 "punpcklwd %%mm7, %%mm7 \n\t"
958 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
960 "movq _mask32_0, %%mm0 \n\t"
961 "movq _mask32_1, %%mm1 \n\t"
962 "movq _mask32_2, %%mm2 \n\t"
963 "movq _mask32_3, %%mm3 \n\t"
965 "pand %%mm7, %%mm0 \n\t"
966 "pand %%mm7, %%mm1 \n\t"
967 "pand %%mm7, %%mm2 \n\t"
968 "pand %%mm7, %%mm3 \n\t"
970 "pcmpeqb %%mm6, %%mm0 \n\t"
971 "pcmpeqb %%mm6, %%mm1 \n\t"
972 "pcmpeqb %%mm6, %%mm2 \n\t"
973 "pcmpeqb %%mm6, %%mm3 \n\t"
975 // preload "movl len, %%ecx \n\t" // load length of line
976 // preload "movl srcptr, %%esi \n\t" // load source
977 // preload "movl dstptr, %%edi \n\t" // load dest
979 "cmpl $0, %%ecx \n\t" // lcr
980 "jz mainloop32end \n\t"
982 "mainloop32: \n\t"
983 "movq (%%esi), %%mm4 \n\t"
984 "pand %%mm0, %%mm4 \n\t"
985 "movq %%mm0, %%mm6 \n\t"
986 "movq (%%edi), %%mm7 \n\t"
987 "pandn %%mm7, %%mm6 \n\t"
988 "por %%mm6, %%mm4 \n\t"
989 "movq %%mm4, (%%edi) \n\t"
991 "movq 8(%%esi), %%mm5 \n\t"
992 "pand %%mm1, %%mm5 \n\t"
993 "movq %%mm1, %%mm7 \n\t"
994 "movq 8(%%edi), %%mm6 \n\t"
995 "pandn %%mm6, %%mm7 \n\t"
996 "por %%mm7, %%mm5 \n\t"
997 "movq %%mm5, 8(%%edi) \n\t"
999 "movq 16(%%esi), %%mm6 \n\t"
1000 "pand %%mm2, %%mm6 \n\t"
1001 "movq %%mm2, %%mm4 \n\t"
1002 "movq 16(%%edi), %%mm7 \n\t"
1003 "pandn %%mm7, %%mm4 \n\t"
1004 "por %%mm4, %%mm6 \n\t"
1005 "movq %%mm6, 16(%%edi) \n\t"
1007 "movq 24(%%esi), %%mm7 \n\t"
1008 "pand %%mm3, %%mm7 \n\t"
1009 "movq %%mm3, %%mm5 \n\t"
1010 "movq 24(%%edi), %%mm4 \n\t"
1011 "pandn %%mm4, %%mm5 \n\t"
1012 "por %%mm5, %%mm7 \n\t"
1013 "movq %%mm7, 24(%%edi) \n\t"
1015 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1016 "addl $32, %%edi \n\t"
1017 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1018 "ja mainloop32 \n\t"
1020 "mainloop32end: \n\t"
1021 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1022 "movl %%eax, %%ecx \n\t"
1023 "cmpl $0, %%ecx \n\t"
1024 "jz end32 \n\t"
1025 // preload "movl mask, %%edx \n\t"
1026 "sall $24, %%edx \n\t" // low byte => high byte
1028 "secondloop32: \n\t"
1029 "sall %%edx \n\t" // move high bit to CF
1030 "jnc skip32 \n\t" // if CF = 0
1031 "movl (%%esi), %%eax \n\t"
1032 "movl %%eax, (%%edi) \n\t"
1034 "skip32: \n\t"
1035 "addl $4, %%esi \n\t"
1036 "addl $4, %%edi \n\t"
1037 "decl %%ecx \n\t"
1038 "jnz secondloop32 \n\t"
1040 "end32: \n\t"
1041 "EMMS \n\t" // DONE
1043 : "=a" (dummy_value_a), // output regs (dummy)
1044 "=d" (dummy_value_d),
1045 "=c" (dummy_value_c),
1046 "=S" (dummy_value_S),
1047 "=D" (dummy_value_D)
1049 : "3" (srcptr), // esi // input regs
1050 "4" (dstptr), // edi
1051 "0" (diff), // eax
1052 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1053 "2" (len), // ecx
1054 "1" (mask) // edx
1056 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1057 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1058 , "%mm4", "%mm5", "%mm6", "%mm7"
1059 #endif
1060 );
1061 }
1062 else /* mmx _not supported - Use modified C routine */
1063 {
1064 register png_uint_32 i;
1065 png_uint_32 initial_val = 4 * png_pass_start[png_ptr->pass];
1066 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1067 register int stride = 4 * png_pass_inc[png_ptr->pass];
1068 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1069 register int rep_bytes = 4 * png_pass_width[png_ptr->pass];
1070 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1071 register png_uint_32 final_val = 4 * png_ptr->width;
1073 srcptr = png_ptr->row_buf + 1 + initial_val;
1074 dstptr = row + initial_val;
1076 for (i = initial_val; i < final_val; i += stride)
1077 {
1078 png_memcpy(dstptr, srcptr, rep_bytes);
1079 srcptr += stride;
1080 dstptr += stride;
1081 }
1082 } /* end of else */
1084 break;
1085 } // end 32 bpp
1087 case 48: // png_ptr->row_info.pixel_depth
1088 {
1089 png_bytep srcptr;
1090 png_bytep dstptr;
1092 if ( _mmx_supported )
1093 {
1094 png_uint_32 len;
1095 int diff;
1096 int dummy_value_a; // fix 'forbidden register spilled' error
1097 int dummy_value_d;
1098 int dummy_value_c;
1099 int dummy_value_S;
1100 int dummy_value_D;
1101 _unmask = ~mask; // global variable for -fPIC version
1102 srcptr = png_ptr->row_buf + 1;
1103 dstptr = row;
1104 len = png_ptr->width &~7; // reduce to multiple of 8
1105 diff = png_ptr->width & 7; // amount lost
1107 __asm__ __volatile__ (
1108 "movd _unmask, %%mm7 \n\t" // load bit pattern
1109 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1110 "punpcklbw %%mm7, %%mm7 \n\t"
1111 "punpcklwd %%mm7, %%mm7 \n\t"
1112 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1114 "movq _mask48_0, %%mm0 \n\t"
1115 "movq _mask48_1, %%mm1 \n\t"
1116 "movq _mask48_2, %%mm2 \n\t"
1117 "movq _mask48_3, %%mm3 \n\t"
1118 "movq _mask48_4, %%mm4 \n\t"
1119 "movq _mask48_5, %%mm5 \n\t"
1121 "pand %%mm7, %%mm0 \n\t"
1122 "pand %%mm7, %%mm1 \n\t"
1123 "pand %%mm7, %%mm2 \n\t"
1124 "pand %%mm7, %%mm3 \n\t"
1125 "pand %%mm7, %%mm4 \n\t"
1126 "pand %%mm7, %%mm5 \n\t"
1128 "pcmpeqb %%mm6, %%mm0 \n\t"
1129 "pcmpeqb %%mm6, %%mm1 \n\t"
1130 "pcmpeqb %%mm6, %%mm2 \n\t"
1131 "pcmpeqb %%mm6, %%mm3 \n\t"
1132 "pcmpeqb %%mm6, %%mm4 \n\t"
1133 "pcmpeqb %%mm6, %%mm5 \n\t"
1135 // preload "movl len, %%ecx \n\t" // load length of line
1136 // preload "movl srcptr, %%esi \n\t" // load source
1137 // preload "movl dstptr, %%edi \n\t" // load dest
1139 "cmpl $0, %%ecx \n\t"
1140 "jz mainloop48end \n\t"
1142 "mainloop48: \n\t"
1143 "movq (%%esi), %%mm7 \n\t"
1144 "pand %%mm0, %%mm7 \n\t"
1145 "movq %%mm0, %%mm6 \n\t"
1146 "pandn (%%edi), %%mm6 \n\t"
1147 "por %%mm6, %%mm7 \n\t"
1148 "movq %%mm7, (%%edi) \n\t"
1150 "movq 8(%%esi), %%mm6 \n\t"
1151 "pand %%mm1, %%mm6 \n\t"
1152 "movq %%mm1, %%mm7 \n\t"
1153 "pandn 8(%%edi), %%mm7 \n\t"
1154 "por %%mm7, %%mm6 \n\t"
1155 "movq %%mm6, 8(%%edi) \n\t"
1157 "movq 16(%%esi), %%mm6 \n\t"
1158 "pand %%mm2, %%mm6 \n\t"
1159 "movq %%mm2, %%mm7 \n\t"
1160 "pandn 16(%%edi), %%mm7 \n\t"
1161 "por %%mm7, %%mm6 \n\t"
1162 "movq %%mm6, 16(%%edi) \n\t"
1164 "movq 24(%%esi), %%mm7 \n\t"
1165 "pand %%mm3, %%mm7 \n\t"
1166 "movq %%mm3, %%mm6 \n\t"
1167 "pandn 24(%%edi), %%mm6 \n\t"
1168 "por %%mm6, %%mm7 \n\t"
1169 "movq %%mm7, 24(%%edi) \n\t"
1171 "movq 32(%%esi), %%mm6 \n\t"
1172 "pand %%mm4, %%mm6 \n\t"
1173 "movq %%mm4, %%mm7 \n\t"
1174 "pandn 32(%%edi), %%mm7 \n\t"
1175 "por %%mm7, %%mm6 \n\t"
1176 "movq %%mm6, 32(%%edi) \n\t"
1178 "movq 40(%%esi), %%mm7 \n\t"
1179 "pand %%mm5, %%mm7 \n\t"
1180 "movq %%mm5, %%mm6 \n\t"
1181 "pandn 40(%%edi), %%mm6 \n\t"
1182 "por %%mm6, %%mm7 \n\t"
1183 "movq %%mm7, 40(%%edi) \n\t"
1185 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1186 "addl $48, %%edi \n\t"
1187 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1189 "ja mainloop48 \n\t"
1191 "mainloop48end: \n\t"
1192 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1193 "movl %%eax, %%ecx \n\t"
1194 "cmpl $0, %%ecx \n\t"
1195 "jz end48 \n\t"
1196 // preload "movl mask, %%edx \n\t"
1197 "sall $24, %%edx \n\t" // make low byte, high byte
1199 "secondloop48: \n\t"
1200 "sall %%edx \n\t" // move high bit to CF
1201 "jnc skip48 \n\t" // if CF = 0
1202 "movl (%%esi), %%eax \n\t"
1203 "movl %%eax, (%%edi) \n\t"
1205 "skip48: \n\t"
1206 "addl $4, %%esi \n\t"
1207 "addl $4, %%edi \n\t"
1208 "decl %%ecx \n\t"
1209 "jnz secondloop48 \n\t"
1211 "end48: \n\t"
1212 "EMMS \n\t" // DONE
1214 : "=a" (dummy_value_a), // output regs (dummy)
1215 "=d" (dummy_value_d),
1216 "=c" (dummy_value_c),
1217 "=S" (dummy_value_S),
1218 "=D" (dummy_value_D)
1220 : "3" (srcptr), // esi // input regs
1221 "4" (dstptr), // edi
1222 "0" (diff), // eax
1223 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1224 "2" (len), // ecx
1225 "1" (mask) // edx
1227 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1228 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1229 , "%mm4", "%mm5", "%mm6", "%mm7"
1230 #endif
1231 );
1232 }
1233 else /* mmx _not supported - Use modified C routine */
1234 {
1235 register png_uint_32 i;
1236 png_uint_32 initial_val = 6 * png_pass_start[png_ptr->pass];
1237 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1238 register int stride = 6 * png_pass_inc[png_ptr->pass];
1239 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1240 register int rep_bytes = 6 * png_pass_width[png_ptr->pass];
1241 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1242 register png_uint_32 final_val = 6 * png_ptr->width;
1244 srcptr = png_ptr->row_buf + 1 + initial_val;
1245 dstptr = row + initial_val;
1247 for (i = initial_val; i < final_val; i += stride)
1248 {
1249 png_memcpy(dstptr, srcptr, rep_bytes);
1250 srcptr += stride;
1251 dstptr += stride;
1252 }
1253 } /* end of else */
1255 break;
1256 } // end 48 bpp
1258 case 64: // png_ptr->row_info.pixel_depth
1259 {
1260 png_bytep srcptr;
1261 png_bytep dstptr;
1262 register png_uint_32 i;
1263 png_uint_32 initial_val = 8 * png_pass_start[png_ptr->pass];
1264 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1265 register int stride = 8 * png_pass_inc[png_ptr->pass];
1266 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1267 register int rep_bytes = 8 * png_pass_width[png_ptr->pass];
1268 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1269 register png_uint_32 final_val = 8 * png_ptr->width;
1271 srcptr = png_ptr->row_buf + 1 + initial_val;
1272 dstptr = row + initial_val;
1274 for (i = initial_val; i < final_val; i += stride)
1275 {
1276 png_memcpy(dstptr, srcptr, rep_bytes);
1277 srcptr += stride;
1278 dstptr += stride;
1279 }
1280 break;
1281 } // end 64 bpp
1283 default: // png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64
1284 {
1285 // this should never happen
1286 fprintf(stderr,
1287 "libpng internal error: png_ptr->row_info.pixel_depth = %d\n",
1288 png_ptr->row_info.pixel_depth);
1289 fflush(stderr);
1290 break;
1291 }
1292 } /* end switch (png_ptr->row_info.pixel_depth) */
1294 } /* end if (non-trivial mask) */
1296 } /* end png_combine_row() */
1298 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1303 //===========================================================================//
1304 // //
1305 // P N G _ D O _ R E A D _ I N T E R L A C E //
1306 // //
1307 //===========================================================================//
1309 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1310 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1312 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1313 * has taken place. [GRR: what other steps come before and/or after?]
1314 */
1316 void /* PRIVATE */
1317 png_do_read_interlace(png_structp png_ptr)
1318 {
1319 png_row_infop row_info = &(png_ptr->row_info);
1320 png_bytep row = png_ptr->row_buf + 1;
1321 int pass = png_ptr->pass;
1322 png_uint_32 transformations = png_ptr->transformations;
1324 png_debug(1,"in png_do_read_interlace\n");
1326 if (_mmx_supported == 2) {
1327 png_mmx_support();
1328 }
1330 if (row != NULL && row_info != NULL)
1331 {
1332 png_uint_32 final_width;
1334 final_width = row_info->width * png_pass_inc[pass];
1336 switch (row_info->pixel_depth)
1337 {
1338 case 1:
1339 {
1340 png_bytep sp, dp;
1341 int sshift, dshift;
1342 int s_start, s_end, s_inc;
1343 png_byte v;
1344 png_uint_32 i;
1345 int j;
1347 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1348 dp = row + (png_size_t)((final_width - 1) >> 3);
1349 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1350 if (transformations & PNG_PACKSWAP)
1351 {
1352 sshift = (int)((row_info->width + 7) & 7);
1353 dshift = (int)((final_width + 7) & 7);
1354 s_start = 7;
1355 s_end = 0;
1356 s_inc = -1;
1357 }
1358 else
1359 #endif
1360 {
1361 sshift = 7 - (int)((row_info->width + 7) & 7);
1362 dshift = 7 - (int)((final_width + 7) & 7);
1363 s_start = 0;
1364 s_end = 7;
1365 s_inc = 1;
1366 }
1368 for (i = row_info->width; i; i--)
1369 {
1370 v = (png_byte)((*sp >> sshift) & 0x1);
1371 for (j = 0; j < png_pass_inc[pass]; j++)
1372 {
1373 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1374 *dp |= (png_byte)(v << dshift);
1375 if (dshift == s_end)
1376 {
1377 dshift = s_start;
1378 dp--;
1379 }
1380 else
1381 dshift += s_inc;
1382 }
1383 if (sshift == s_end)
1384 {
1385 sshift = s_start;
1386 sp--;
1387 }
1388 else
1389 sshift += s_inc;
1390 }
1391 break;
1392 }
1394 case 2:
1395 {
1396 png_bytep sp, dp;
1397 int sshift, dshift;
1398 int s_start, s_end, s_inc;
1399 png_uint_32 i;
1401 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1402 dp = row + (png_size_t)((final_width - 1) >> 2);
1403 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1404 if (transformations & PNG_PACKSWAP)
1405 {
1406 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1407 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1408 s_start = 6;
1409 s_end = 0;
1410 s_inc = -2;
1411 }
1412 else
1413 #endif
1414 {
1415 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1416 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1417 s_start = 0;
1418 s_end = 6;
1419 s_inc = 2;
1420 }
1422 for (i = row_info->width; i; i--)
1423 {
1424 png_byte v;
1425 int j;
1427 v = (png_byte)((*sp >> sshift) & 0x3);
1428 for (j = 0; j < png_pass_inc[pass]; j++)
1429 {
1430 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1431 *dp |= (png_byte)(v << dshift);
1432 if (dshift == s_end)
1433 {
1434 dshift = s_start;
1435 dp--;
1436 }
1437 else
1438 dshift += s_inc;
1439 }
1440 if (sshift == s_end)
1441 {
1442 sshift = s_start;
1443 sp--;
1444 }
1445 else
1446 sshift += s_inc;
1447 }
1448 break;
1449 }
1451 case 4:
1452 {
1453 png_bytep sp, dp;
1454 int sshift, dshift;
1455 int s_start, s_end, s_inc;
1456 png_uint_32 i;
1458 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1459 dp = row + (png_size_t)((final_width - 1) >> 1);
1460 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1461 if (transformations & PNG_PACKSWAP)
1462 {
1463 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1464 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1465 s_start = 4;
1466 s_end = 0;
1467 s_inc = -4;
1468 }
1469 else
1470 #endif
1471 {
1472 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1473 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1474 s_start = 0;
1475 s_end = 4;
1476 s_inc = 4;
1477 }
1479 for (i = row_info->width; i; i--)
1480 {
1481 png_byte v;
1482 int j;
1484 v = (png_byte)((*sp >> sshift) & 0xf);
1485 for (j = 0; j < png_pass_inc[pass]; j++)
1486 {
1487 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1488 *dp |= (png_byte)(v << dshift);
1489 if (dshift == s_end)
1490 {
1491 dshift = s_start;
1492 dp--;
1493 }
1494 else
1495 dshift += s_inc;
1496 }
1497 if (sshift == s_end)
1498 {
1499 sshift = s_start;
1500 sp--;
1501 }
1502 else
1503 sshift += s_inc;
1504 }
1505 break;
1506 }
1508 //====================================================================
1510 default: // 8-bit or larger (this is where the routine is modified)
1511 {
1512 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1513 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1514 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1515 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1516 png_bytep sptr, dp;
1517 png_uint_32 i;
1518 png_size_t pixel_bytes;
1519 int width = row_info->width;
1521 pixel_bytes = (row_info->pixel_depth >> 3);
1523 // point sptr at the last pixel in the pre-expanded row:
1524 sptr = row + (width - 1) * pixel_bytes;
1526 // point dp at the last pixel position in the expanded row:
1527 dp = row + (final_width - 1) * pixel_bytes;
1529 // New code by Nirav Chhatrapati - Intel Corporation
1531 if ( _mmx_supported )
1532 {
1533 //--------------------------------------------------------------
1534 if (pixel_bytes == 3)
1535 {
1536 if (((pass == 0) || (pass == 1)) && width)
1537 {
1538 int dummy_value_c; // fix 'forbidden register spilled'
1539 int dummy_value_S;
1540 int dummy_value_D;
1542 __asm__ __volatile__ (
1543 "subl $21, %%edi \n\t"
1544 // (png_pass_inc[pass] - 1)*pixel_bytes
1546 ".loop3_pass0: \n\t"
1547 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1548 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1549 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1550 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1551 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1552 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1553 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1554 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1555 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1556 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1557 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1558 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1559 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1560 "movq %%mm4, 16(%%edi) \n\t"
1561 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1562 "movq %%mm3, 8(%%edi) \n\t"
1563 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1564 "subl $3, %%esi \n\t"
1565 "movq %%mm0, (%%edi) \n\t"
1566 "subl $24, %%edi \n\t"
1567 "decl %%ecx \n\t"
1568 "jnz .loop3_pass0 \n\t"
1569 "EMMS \n\t" // DONE
1571 : "=c" (dummy_value_c), // output regs (dummy)
1572 "=S" (dummy_value_S),
1573 "=D" (dummy_value_D)
1575 : "1" (sptr), // esi // input regs
1576 "2" (dp), // edi
1577 "0" (width) // ecx
1578 // doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
1580 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1581 : "%mm0", "%mm1", "%mm2" // clobber list
1582 , "%mm3", "%mm4"
1583 #endif
1584 );
1585 }
1586 else if (((pass == 2) || (pass == 3)) && width)
1587 {
1588 int dummy_value_c; // fix 'forbidden register spilled'
1589 int dummy_value_S;
1590 int dummy_value_D;
1592 __asm__ __volatile__ (
1593 "subl $9, %%edi \n\t"
1594 // (png_pass_inc[pass] - 1)*pixel_bytes
1596 ".loop3_pass2: \n\t"
1597 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1598 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1599 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1600 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1601 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1602 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1603 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1604 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1605 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1606 "movq %%mm0, 4(%%edi) \n\t"
1607 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1608 "subl $3, %%esi \n\t"
1609 "movd %%mm0, (%%edi) \n\t"
1610 "subl $12, %%edi \n\t"
1611 "decl %%ecx \n\t"
1612 "jnz .loop3_pass2 \n\t"
1613 "EMMS \n\t" // DONE
1615 : "=c" (dummy_value_c), // output regs (dummy)
1616 "=S" (dummy_value_S),
1617 "=D" (dummy_value_D)
1619 : "1" (sptr), // esi // input regs
1620 "2" (dp), // edi
1621 "0" (width) // ecx
1623 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1624 : "%mm0", "%mm1", "%mm2" // clobber list
1625 #endif
1626 );
1627 }
1628 else if (width) /* && ((pass == 4) || (pass == 5)) */
1629 {
1630 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1631 if (width_mmx < 0)
1632 width_mmx = 0;
1633 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1634 if (width_mmx)
1635 {
1636 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1637 // sptr points at last pixel in pre-expanded row
1638 // dp points at last pixel position in expanded row
1639 int dummy_value_c; // fix 'forbidden register spilled'
1640 int dummy_value_S;
1641 int dummy_value_D;
1643 __asm__ __volatile__ (
1644 "subl $3, %%esi \n\t"
1645 "subl $9, %%edi \n\t"
1646 // (png_pass_inc[pass] + 1)*pixel_bytes
1648 ".loop3_pass4: \n\t"
1649 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1650 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1651 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1652 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1653 "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1654 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1655 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1656 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1657 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1658 "movq %%mm0, (%%edi) \n\t"
1659 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1660 "pand _const6, %%mm3 \n\t" // z z z z z z z 5
1661 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1662 "subl $6, %%esi \n\t"
1663 "movd %%mm2, 8(%%edi) \n\t"
1664 "subl $12, %%edi \n\t"
1665 "subl $2, %%ecx \n\t"
1666 "jnz .loop3_pass4 \n\t"
1667 "EMMS \n\t" // DONE
1669 : "=c" (dummy_value_c), // output regs (dummy)
1670 "=S" (dummy_value_S),
1671 "=D" (dummy_value_D)
1673 : "1" (sptr), // esi // input regs
1674 "2" (dp), // edi
1675 "0" (width_mmx) // ecx
1677 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1678 : "%mm0", "%mm1" // clobber list
1679 , "%mm2", "%mm3"
1680 #endif
1681 );
1682 }
1684 sptr -= width_mmx*3;
1685 dp -= width_mmx*6;
1686 for (i = width; i; i--)
1687 {
1688 png_byte v[8];
1689 int j;
1691 png_memcpy(v, sptr, 3);
1692 for (j = 0; j < png_pass_inc[pass]; j++)
1693 {
1694 png_memcpy(dp, v, 3);
1695 dp -= 3;
1696 }
1697 sptr -= 3;
1698 }
1699 }
1700 } /* end of pixel_bytes == 3 */
1702 //--------------------------------------------------------------
1703 else if (pixel_bytes == 1)
1704 {
1705 if (((pass == 0) || (pass == 1)) && width)
1706 {
1707 int width_mmx = ((width >> 2) << 2);
1708 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1709 if (width_mmx)
1710 {
1711 int dummy_value_c; // fix 'forbidden register spilled'
1712 int dummy_value_S;
1713 int dummy_value_D;
1715 __asm__ __volatile__ (
1716 "subl $3, %%esi \n\t"
1717 "subl $31, %%edi \n\t"
1719 ".loop1_pass0: \n\t"
1720 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1721 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1722 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1723 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1724 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1725 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1726 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1727 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1728 "movq %%mm0, (%%edi) \n\t"
1729 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1730 "movq %%mm3, 8(%%edi) \n\t"
1731 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1732 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1733 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1734 "movq %%mm2, 16(%%edi) \n\t"
1735 "subl $4, %%esi \n\t"
1736 "movq %%mm4, 24(%%edi) \n\t"
1737 "subl $32, %%edi \n\t"
1738 "subl $4, %%ecx \n\t"
1739 "jnz .loop1_pass0 \n\t"
1740 "EMMS \n\t" // DONE
1742 : "=c" (dummy_value_c), // output regs (dummy)
1743 "=S" (dummy_value_S),
1744 "=D" (dummy_value_D)
1746 : "1" (sptr), // esi // input regs
1747 "2" (dp), // edi
1748 "0" (width_mmx) // ecx
1750 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1751 : "%mm0", "%mm1", "%mm2" // clobber list
1752 , "%mm3", "%mm4"
1753 #endif
1754 );
1755 }
1757 sptr -= width_mmx;
1758 dp -= width_mmx*8;
1759 for (i = width; i; i--)
1760 {
1761 int j;
1763 /* I simplified this part in version 1.0.4e
1764 * here and in several other instances where
1765 * pixel_bytes == 1 -- GR-P
1766 *
1767 * Original code:
1768 *
1769 * png_byte v[8];
1770 * png_memcpy(v, sptr, pixel_bytes);
1771 * for (j = 0; j < png_pass_inc[pass]; j++)
1772 * {
1773 * png_memcpy(dp, v, pixel_bytes);
1774 * dp -= pixel_bytes;
1775 * }
1776 * sptr -= pixel_bytes;
1777 *
1778 * Replacement code is in the next three lines:
1779 */
1781 for (j = 0; j < png_pass_inc[pass]; j++)
1782 *dp-- = *sptr;
1783 --sptr;
1784 }
1785 }
1786 else if (((pass == 2) || (pass == 3)) && width)
1787 {
1788 int width_mmx = ((width >> 2) << 2);
1789 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1790 if (width_mmx)
1791 {
1792 int dummy_value_c; // fix 'forbidden register spilled'
1793 int dummy_value_S;
1794 int dummy_value_D;
1796 __asm__ __volatile__ (
1797 "subl $3, %%esi \n\t"
1798 "subl $15, %%edi \n\t"
1800 ".loop1_pass2: \n\t"
1801 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1802 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1803 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
1804 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1805 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
1806 "movq %%mm0, (%%edi) \n\t"
1807 "subl $4, %%esi \n\t"
1808 "movq %%mm1, 8(%%edi) \n\t"
1809 "subl $16, %%edi \n\t"
1810 "subl $4, %%ecx \n\t"
1811 "jnz .loop1_pass2 \n\t"
1812 "EMMS \n\t" // DONE
1814 : "=c" (dummy_value_c), // output regs (dummy)
1815 "=S" (dummy_value_S),
1816 "=D" (dummy_value_D)
1818 : "1" (sptr), // esi // input regs
1819 "2" (dp), // edi
1820 "0" (width_mmx) // ecx
1822 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1823 : "%mm0", "%mm1" // clobber list
1824 #endif
1825 );
1826 }
1828 sptr -= width_mmx;
1829 dp -= width_mmx*4;
1830 for (i = width; i; i--)
1831 {
1832 int j;
1834 for (j = 0; j < png_pass_inc[pass]; j++)
1835 *dp-- = *sptr;
1836 --sptr;
1837 }
1838 }
1839 else if (width) /* && ((pass == 4) || (pass == 5)) */
1840 {
1841 int width_mmx = ((width >> 3) << 3);
1842 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1843 if (width_mmx)
1844 {
1845 int dummy_value_c; // fix 'forbidden register spilled'
1846 int dummy_value_S;
1847 int dummy_value_D;
1849 __asm__ __volatile__ (
1850 "subl $7, %%esi \n\t"
1851 "subl $15, %%edi \n\t"
1853 ".loop1_pass4: \n\t"
1854 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
1855 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
1856 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1857 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
1858 "movq %%mm1, 8(%%edi) \n\t"
1859 "subl $8, %%esi \n\t"
1860 "movq %%mm0, (%%edi) \n\t"
1861 "subl $16, %%edi \n\t"
1862 "subl $8, %%ecx \n\t"
1863 "jnz .loop1_pass4 \n\t"
1864 "EMMS \n\t" // DONE
1866 : "=c" (dummy_value_c), // output regs (none)
1867 "=S" (dummy_value_S),
1868 "=D" (dummy_value_D)
1870 : "1" (sptr), // esi // input regs
1871 "2" (dp), // edi
1872 "0" (width_mmx) // ecx
1874 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1875 : "%mm0", "%mm1" // clobber list
1876 #endif
1877 );
1878 }
1880 sptr -= width_mmx;
1881 dp -= width_mmx*2;
1882 for (i = width; i; i--)
1883 {
1884 int j;
1886 for (j = 0; j < png_pass_inc[pass]; j++)
1887 *dp-- = *sptr;
1888 --sptr;
1889 }
1890 }
1891 } /* end of pixel_bytes == 1 */
1893 //--------------------------------------------------------------
1894 else if (pixel_bytes == 2)
1895 {
1896 if (((pass == 0) || (pass == 1)) && width)
1897 {
1898 int width_mmx = ((width >> 1) << 1);
1899 width -= width_mmx; // 0,1 pixels => 0,2 bytes
1900 if (width_mmx)
1901 {
1902 int dummy_value_c; // fix 'forbidden register spilled'
1903 int dummy_value_S;
1904 int dummy_value_D;
1906 __asm__ __volatile__ (
1907 "subl $2, %%esi \n\t"
1908 "subl $30, %%edi \n\t"
1910 ".loop2_pass0: \n\t"
1911 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1912 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1913 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1914 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1915 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1916 "movq %%mm0, (%%edi) \n\t"
1917 "movq %%mm0, 8(%%edi) \n\t"
1918 "movq %%mm1, 16(%%edi) \n\t"
1919 "subl $4, %%esi \n\t"
1920 "movq %%mm1, 24(%%edi) \n\t"
1921 "subl $32, %%edi \n\t"
1922 "subl $2, %%ecx \n\t"
1923 "jnz .loop2_pass0 \n\t"
1924 "EMMS \n\t" // DONE
1926 : "=c" (dummy_value_c), // output regs (dummy)
1927 "=S" (dummy_value_S),
1928 "=D" (dummy_value_D)
1930 : "1" (sptr), // esi // input regs
1931 "2" (dp), // edi
1932 "0" (width_mmx) // ecx
1934 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1935 : "%mm0", "%mm1" // clobber list
1936 #endif
1937 );
1938 }
1940 sptr -= (width_mmx*2 - 2); // sign fixed
1941 dp -= (width_mmx*16 - 2); // sign fixed
1942 for (i = width; i; i--)
1943 {
1944 png_byte v[8];
1945 int j;
1946 sptr -= 2;
1947 png_memcpy(v, sptr, 2);
1948 for (j = 0; j < png_pass_inc[pass]; j++)
1949 {
1950 dp -= 2;
1951 png_memcpy(dp, v, 2);
1952 }
1953 }
1954 }
1955 else if (((pass == 2) || (pass == 3)) && width)
1956 {
1957 int width_mmx = ((width >> 1) << 1) ;
1958 width -= width_mmx; // 0,1 pixels => 0,2 bytes
1959 if (width_mmx)
1960 {
1961 int dummy_value_c; // fix 'forbidden register spilled'
1962 int dummy_value_S;
1963 int dummy_value_D;
1965 __asm__ __volatile__ (
1966 "subl $2, %%esi \n\t"
1967 "subl $14, %%edi \n\t"
1969 ".loop2_pass2: \n\t"
1970 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1971 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1972 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1973 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1974 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1975 "movq %%mm0, (%%edi) \n\t"
1976 "subl $4, %%esi \n\t"
1977 "movq %%mm1, 8(%%edi) \n\t"
1978 "subl $16, %%edi \n\t"
1979 "subl $2, %%ecx \n\t"
1980 "jnz .loop2_pass2 \n\t"
1981 "EMMS \n\t" // DONE
1983 : "=c" (dummy_value_c), // output regs (dummy)
1984 "=S" (dummy_value_S),
1985 "=D" (dummy_value_D)
1987 : "1" (sptr), // esi // input regs
1988 "2" (dp), // edi
1989 "0" (width_mmx) // ecx
1991 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992 : "%mm0", "%mm1" // clobber list
1993 #endif
1994 );
1995 }
1997 sptr -= (width_mmx*2 - 2); // sign fixed
1998 dp -= (width_mmx*8 - 2); // sign fixed
1999 for (i = width; i; i--)
2000 {
2001 png_byte v[8];
2002 int j;
2003 sptr -= 2;
2004 png_memcpy(v, sptr, 2);
2005 for (j = 0; j < png_pass_inc[pass]; j++)
2006 {
2007 dp -= 2;
2008 png_memcpy(dp, v, 2);
2009 }
2010 }
2011 }
2012 else if (width) // pass == 4 or 5
2013 {
2014 int width_mmx = ((width >> 1) << 1) ;
2015 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2016 if (width_mmx)
2017 {
2018 int dummy_value_c; // fix 'forbidden register spilled'
2019 int dummy_value_S;
2020 int dummy_value_D;
2022 __asm__ __volatile__ (
2023 "subl $2, %%esi \n\t"
2024 "subl $6, %%edi \n\t"
2026 ".loop2_pass4: \n\t"
2027 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2028 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2029 "subl $4, %%esi \n\t"
2030 "movq %%mm0, (%%edi) \n\t"
2031 "subl $8, %%edi \n\t"
2032 "subl $2, %%ecx \n\t"
2033 "jnz .loop2_pass4 \n\t"
2034 "EMMS \n\t" // DONE
2036 : "=c" (dummy_value_c), // output regs (dummy)
2037 "=S" (dummy_value_S),
2038 "=D" (dummy_value_D)
2040 : "1" (sptr), // esi // input regs
2041 "2" (dp), // edi
2042 "0" (width_mmx) // ecx
2044 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2045 : "%mm0" // clobber list
2046 #endif
2047 );
2048 }
2050 sptr -= (width_mmx*2 - 2); // sign fixed
2051 dp -= (width_mmx*4 - 2); // sign fixed
2052 for (i = width; i; i--)
2053 {
2054 png_byte v[8];
2055 int j;
2056 sptr -= 2;
2057 png_memcpy(v, sptr, 2);
2058 for (j = 0; j < png_pass_inc[pass]; j++)
2059 {
2060 dp -= 2;
2061 png_memcpy(dp, v, 2);
2062 }
2063 }
2064 }
2065 } /* end of pixel_bytes == 2 */
2067 //--------------------------------------------------------------
2068 else if (pixel_bytes == 4)
2069 {
2070 if (((pass == 0) || (pass == 1)) && width)
2071 {
2072 int width_mmx = ((width >> 1) << 1);
2073 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2074 if (width_mmx)
2075 {
2076 int dummy_value_c; // fix 'forbidden register spilled'
2077 int dummy_value_S;
2078 int dummy_value_D;
2080 __asm__ __volatile__ (
2081 "subl $4, %%esi \n\t"
2082 "subl $60, %%edi \n\t"
2084 ".loop4_pass0: \n\t"
2085 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2086 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2087 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2088 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2089 "movq %%mm0, (%%edi) \n\t"
2090 "movq %%mm0, 8(%%edi) \n\t"
2091 "movq %%mm0, 16(%%edi) \n\t"
2092 "movq %%mm0, 24(%%edi) \n\t"
2093 "movq %%mm1, 32(%%edi) \n\t"
2094 "movq %%mm1, 40(%%edi) \n\t"
2095 "movq %%mm1, 48(%%edi) \n\t"
2096 "subl $8, %%esi \n\t"
2097 "movq %%mm1, 56(%%edi) \n\t"
2098 "subl $64, %%edi \n\t"
2099 "subl $2, %%ecx \n\t"
2100 "jnz .loop4_pass0 \n\t"
2101 "EMMS \n\t" // DONE
2103 : "=c" (dummy_value_c), // output regs (dummy)
2104 "=S" (dummy_value_S),
2105 "=D" (dummy_value_D)
2107 : "1" (sptr), // esi // input regs
2108 "2" (dp), // edi
2109 "0" (width_mmx) // ecx
2111 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2112 : "%mm0", "%mm1" // clobber list
2113 #endif
2114 );
2115 }
2117 sptr -= (width_mmx*4 - 4); // sign fixed
2118 dp -= (width_mmx*32 - 4); // sign fixed
2119 for (i = width; i; i--)
2120 {
2121 png_byte v[8];
2122 int j;
2123 sptr -= 4;
2124 png_memcpy(v, sptr, 4);
2125 for (j = 0; j < png_pass_inc[pass]; j++)
2126 {
2127 dp -= 4;
2128 png_memcpy(dp, v, 4);
2129 }
2130 }
2131 }
2132 else if (((pass == 2) || (pass == 3)) && width)
2133 {
2134 int width_mmx = ((width >> 1) << 1);
2135 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2136 if (width_mmx)
2137 {
2138 int dummy_value_c; // fix 'forbidden register spilled'
2139 int dummy_value_S;
2140 int dummy_value_D;
2142 __asm__ __volatile__ (
2143 "subl $4, %%esi \n\t"
2144 "subl $28, %%edi \n\t"
2146 ".loop4_pass2: \n\t"
2147 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2148 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2149 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2150 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2151 "movq %%mm0, (%%edi) \n\t"
2152 "movq %%mm0, 8(%%edi) \n\t"
2153 "movq %%mm1, 16(%%edi) \n\t"
2154 "movq %%mm1, 24(%%edi) \n\t"
2155 "subl $8, %%esi \n\t"
2156 "subl $32, %%edi \n\t"
2157 "subl $2, %%ecx \n\t"
2158 "jnz .loop4_pass2 \n\t"
2159 "EMMS \n\t" // DONE
2161 : "=c" (dummy_value_c), // output regs (dummy)
2162 "=S" (dummy_value_S),
2163 "=D" (dummy_value_D)
2165 : "1" (sptr), // esi // input regs
2166 "2" (dp), // edi
2167 "0" (width_mmx) // ecx
2169 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2170 : "%mm0", "%mm1" // clobber list
2171 #endif
2172 );
2173 }
2175 sptr -= (width_mmx*4 - 4); // sign fixed
2176 dp -= (width_mmx*16 - 4); // sign fixed
2177 for (i = width; i; i--)
2178 {
2179 png_byte v[8];
2180 int j;
2181 sptr -= 4;
2182 png_memcpy(v, sptr, 4);
2183 for (j = 0; j < png_pass_inc[pass]; j++)
2184 {
2185 dp -= 4;
2186 png_memcpy(dp, v, 4);
2187 }
2188 }
2189 }
2190 else if (width) // pass == 4 or 5
2191 {
2192 int width_mmx = ((width >> 1) << 1) ;
2193 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2194 if (width_mmx)
2195 {
2196 int dummy_value_c; // fix 'forbidden register spilled'
2197 int dummy_value_S;
2198 int dummy_value_D;
2200 __asm__ __volatile__ (
2201 "subl $4, %%esi \n\t"
2202 "subl $12, %%edi \n\t"
2204 ".loop4_pass4: \n\t"
2205 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2206 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2207 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2208 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2209 "movq %%mm0, (%%edi) \n\t"
2210 "subl $8, %%esi \n\t"
2211 "movq %%mm1, 8(%%edi) \n\t"
2212 "subl $16, %%edi \n\t"
2213 "subl $2, %%ecx \n\t"
2214 "jnz .loop4_pass4 \n\t"
2215 "EMMS \n\t" // DONE
2217 : "=c" (dummy_value_c), // output regs (dummy)
2218 "=S" (dummy_value_S),
2219 "=D" (dummy_value_D)
2221 : "1" (sptr), // esi // input regs
2222 "2" (dp), // edi
2223 "0" (width_mmx) // ecx
2225 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2226 : "%mm0", "%mm1" // clobber list
2227 #endif
2228 );
2229 }
2231 sptr -= (width_mmx*4 - 4); // sign fixed
2232 dp -= (width_mmx*8 - 4); // sign fixed
2233 for (i = width; i; i--)
2234 {
2235 png_byte v[8];
2236 int j;
2237 sptr -= 4;
2238 png_memcpy(v, sptr, 4);
2239 for (j = 0; j < png_pass_inc[pass]; j++)
2240 {
2241 dp -= 4;
2242 png_memcpy(dp, v, 4);
2243 }
2244 }
2245 }
2246 } /* end of pixel_bytes == 4 */
2248 //--------------------------------------------------------------
2249 else if (pixel_bytes == 8)
2250 {
2251 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2252 // GRR NOTE: no need to combine passes here!
2253 if (((pass == 0) || (pass == 1)) && width)
2254 {
2255 int dummy_value_c; // fix 'forbidden register spilled'
2256 int dummy_value_S;
2257 int dummy_value_D;
2259 // source is 8-byte RRGGBBAA
2260 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2261 __asm__ __volatile__ (
2262 "subl $56, %%edi \n\t" // start of last block
2264 ".loop8_pass0: \n\t"
2265 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2266 "movq %%mm0, (%%edi) \n\t"
2267 "movq %%mm0, 8(%%edi) \n\t"
2268 "movq %%mm0, 16(%%edi) \n\t"
2269 "movq %%mm0, 24(%%edi) \n\t"
2270 "movq %%mm0, 32(%%edi) \n\t"
2271 "movq %%mm0, 40(%%edi) \n\t"
2272 "movq %%mm0, 48(%%edi) \n\t"
2273 "subl $8, %%esi \n\t"
2274 "movq %%mm0, 56(%%edi) \n\t"
2275 "subl $64, %%edi \n\t"
2276 "decl %%ecx \n\t"
2277 "jnz .loop8_pass0 \n\t"
2278 "EMMS \n\t" // DONE
2280 : "=c" (dummy_value_c), // output regs (dummy)
2281 "=S" (dummy_value_S),
2282 "=D" (dummy_value_D)
2284 : "1" (sptr), // esi // input regs
2285 "2" (dp), // edi
2286 "0" (width) // ecx
2288 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2289 : "%mm0" // clobber list
2290 #endif
2291 );
2292 }
2293 else if (((pass == 2) || (pass == 3)) && width)
2294 {
2295 // source is 8-byte RRGGBBAA
2296 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2297 int width_mmx = ((width >> 1) << 1) ;
2298 width -= width_mmx;
2299 if (width_mmx)
2300 {
2301 int dummy_value_c; // fix 'forbidden register spilled'
2302 int dummy_value_S;
2303 int dummy_value_D;
2305 __asm__ __volatile__ (
2306 "subl $24, %%edi \n\t" // start of last block
2308 ".loop8_pass2: \n\t"
2309 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2310 "movq %%mm0, (%%edi) \n\t"
2311 "movq %%mm0, 8(%%edi) \n\t"
2312 "movq %%mm0, 16(%%edi) \n\t"
2313 "subl $8, %%esi \n\t"
2314 "movq %%mm0, 24(%%edi) \n\t"
2315 "subl $32, %%edi \n\t"
2316 "decl %%ecx \n\t"
2317 "jnz .loop8_pass2 \n\t"
2318 "EMMS \n\t" // DONE
2320 : "=c" (dummy_value_c), // output regs (dummy)
2321 "=S" (dummy_value_S),
2322 "=D" (dummy_value_D)
2324 : "1" (sptr), // esi // input regs
2325 "2" (dp), // edi
2326 "0" (width) // ecx
2328 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2329 : "%mm0" // clobber list
2330 #endif
2331 );
2332 }
2333 }
2334 else if (width) // pass == 4 or 5
2335 {
2336 // source is 8-byte RRGGBBAA
2337 // dest is 16-byte RRGGBBAA RRGGBBAA
2338 int width_mmx = ((width >> 1) << 1) ;
2339 width -= width_mmx;
2340 if (width_mmx)
2341 {
2342 int dummy_value_c; // fix 'forbidden register spilled'
2343 int dummy_value_S;
2344 int dummy_value_D;
2346 __asm__ __volatile__ (
2347 "subl $8, %%edi \n\t" // start of last block
2349 ".loop8_pass4: \n\t"
2350 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2351 "movq %%mm0, (%%edi) \n\t"
2352 "subl $8, %%esi \n\t"
2353 "movq %%mm0, 8(%%edi) \n\t"
2354 "subl $16, %%edi \n\t"
2355 "decl %%ecx \n\t"
2356 "jnz .loop8_pass4 \n\t"
2357 "EMMS \n\t" // DONE
2359 : "=c" (dummy_value_c), // output regs (dummy)
2360 "=S" (dummy_value_S),
2361 "=D" (dummy_value_D)
2363 : "1" (sptr), // esi // input regs
2364 "2" (dp), // edi
2365 "0" (width) // ecx
2367 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2368 : "%mm0" // clobber list
2369 #endif
2370 );
2371 }
2372 }
2374 } /* end of pixel_bytes == 8 */
2376 //--------------------------------------------------------------
2377 else if (pixel_bytes == 6)
2378 {
2379 for (i = width; i; i--)
2380 {
2381 png_byte v[8];
2382 int j;
2383 png_memcpy(v, sptr, 6);
2384 for (j = 0; j < png_pass_inc[pass]; j++)
2385 {
2386 png_memcpy(dp, v, 6);
2387 dp -= 6;
2388 }
2389 sptr -= 6;
2390 }
2391 } /* end of pixel_bytes == 6 */
2393 //--------------------------------------------------------------
2394 else
2395 {
2396 for (i = width; i; i--)
2397 {
2398 png_byte v[8];
2399 int j;
2400 png_memcpy(v, sptr, pixel_bytes);
2401 for (j = 0; j < png_pass_inc[pass]; j++)
2402 {
2403 png_memcpy(dp, v, pixel_bytes);
2404 dp -= pixel_bytes;
2405 }
2406 sptr-= pixel_bytes;
2407 }
2408 }
2409 } // end of _mmx_supported ========================================
2411 else /* MMX not supported: use modified C code - takes advantage
2412 * of inlining of memcpy for a constant */
2413 /* GRR 19991007: does it? or should pixel_bytes in each
2414 * block be replaced with immediate value (e.g., 1)? */
2415 /* GRR 19991017: replaced with constants in each case */
2416 {
2417 if (pixel_bytes == 1)
2418 {
2419 for (i = width; i; i--)
2420 {
2421 int j;
2422 for (j = 0; j < png_pass_inc[pass]; j++)
2423 *dp-- = *sptr;
2424 --sptr;
2425 }
2426 }
2427 else if (pixel_bytes == 3)
2428 {
2429 for (i = width; i; i--)
2430 {
2431 png_byte v[8];
2432 int j;
2433 png_memcpy(v, sptr, 3);
2434 for (j = 0; j < png_pass_inc[pass]; j++)
2435 {
2436 png_memcpy(dp, v, 3);
2437 dp -= 3;
2438 }
2439 sptr -= 3;
2440 }
2441 }
2442 else if (pixel_bytes == 2)
2443 {
2444 for (i = width; i; i--)
2445 {
2446 png_byte v[8];
2447 int j;
2448 png_memcpy(v, sptr, 2);
2449 for (j = 0; j < png_pass_inc[pass]; j++)
2450 {
2451 png_memcpy(dp, v, 2);
2452 dp -= 2;
2453 }
2454 sptr -= 2;
2455 }
2456 }
2457 else if (pixel_bytes == 4)
2458 {
2459 for (i = width; i; i--)
2460 {
2461 png_byte v[8];
2462 int j;
2463 png_memcpy(v, sptr, 4);
2464 for (j = 0; j < png_pass_inc[pass]; j++)
2465 {
2466 png_memcpy(dp, v, 4);
2467 dp -= 4;
2468 }
2469 sptr -= 4;
2470 }
2471 }
2472 else if (pixel_bytes == 6)
2473 {
2474 for (i = width; i; i--)
2475 {
2476 png_byte v[8];
2477 int j;
2478 png_memcpy(v, sptr, 6);
2479 for (j = 0; j < png_pass_inc[pass]; j++)
2480 {
2481 png_memcpy(dp, v, 6);
2482 dp -= 6;
2483 }
2484 sptr -= 6;
2485 }
2486 }
2487 else if (pixel_bytes == 8)
2488 {
2489 for (i = width; i; i--)
2490 {
2491 png_byte v[8];
2492 int j;
2493 png_memcpy(v, sptr, 8);
2494 for (j = 0; j < png_pass_inc[pass]; j++)
2495 {
2496 png_memcpy(dp, v, 8);
2497 dp -= 8;
2498 }
2499 sptr -= 8;
2500 }
2501 }
2502 else // GRR: should never be reached
2503 {
2504 for (i = width; i; i--)
2505 {
2506 png_byte v[8];
2507 int j;
2508 png_memcpy(v, sptr, pixel_bytes);
2509 for (j = 0; j < png_pass_inc[pass]; j++)
2510 {
2511 png_memcpy(dp, v, pixel_bytes);
2512 dp -= pixel_bytes;
2513 }
2514 sptr -= pixel_bytes;
2515 }
2516 }
2518 } /* end if (MMX not supported) */
2519 break;
2520 }
2521 } /* end switch (row_info->pixel_depth) */
2523 row_info->width = final_width;
2524 row_info->rowbytes = ((final_width *
2525 (png_uint_32)row_info->pixel_depth + 7) >> 3);
2526 }
2528 } /* end png_do_read_interlace() */
2530 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2531 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2536 // 64-bit constants and scratch values shared by the MMX filter routines
2537 // below.  They are file-scope objects so the inline assembly can name them
// directly as memory operands (e.g. "movq _LBCarryMask, %%mm5").
//
// union uAll overlays a 64-bit integer ('use', the member the C code assigns)
// with a double ('align').  NOTE(review): 'align' is presumably present only
// to encourage 8-byte alignment of these objects; nothing visible here
// enforces it -- confirm against the target ABI/compiler if alignment matters.
//
//   _LBCarryMask   0x01 in every byte: selects the low bit of each byte
//   _HBClearMask   0x7f in every byte: clears bit 7 of each byte after a
//                  64-bit "psrlq $1" has shifted a neighbour's bit into it
//   _ActiveMask, _ShiftBpp, _ShiftRem
//                  set from C just before an MMX loop, then read by the asm
//   _ActiveMask2, _ActiveMaskEnd
//                  not referenced by png_read_filter_row_mmx_avg(); presumably
//                  used by other filter routines in this file -- TODO confirm
2539 union uAll {
2540 long long use;
2541 double align;
2542 } _LBCarryMask = {0x0101010101010101LL},
2543 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2544 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2549 //===========================================================================//
2550 // //
2551 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2552 // //
2553 //===========================================================================//
2555 // Optimized code for PNG Average filter decoder
// png_read_filter_row_mmx_avg(): undo the PNG "Average" filter on one row,
// in place, using x86 inline assembly (MMX for the bulk of the row).
//
//   row_info  supplies pixel_depth (rounded up to whole bytes to get bpp) and
//             rowbytes (the number of bytes to decode, kept in _FullLength)
//   row       holds the filtered bytes Avg(x) on entry and is overwritten
//             with the reconstructed bytes Raw(x)
//   prev_row  Prior(x); read at the same byte offsets as row
//
// Per byte:  Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2), where the
// Raw(x-bpp) term is left out for the first bpp bytes of the row.
//
// The work is done in three stages:
//   1. scalar code decodes the first bpp bytes, then continues byte by byte
//      up to offset _dif, and computes _MMXLength (where the MMX loop stops);
//   2. switch (bpp) runs an MMX loop that handles 8 bytes per iteration from
//      _dif up to _MMXLength.  bpp == 1 has no MMX version: it finishes the
//      whole row with scalar code and returns (no MMX instruction has been
//      executed on that path, so skipping the final EMMS is harmless);
//   3. scalar code decodes the bytes from _MMXLength to _FullLength and
//      issues EMMS.
//
// Every loop tests its bound at the bottom, so each one executes at least
// once.  NOTE(review): this presumably relies on the caller only selecting
// this routine for sufficiently long rows -- confirm at the call site.
//
// Each asm statement lists "memory" as clobbered: the code stores into the
// row buffer and reads/writes the globals _dif, _MMXLength, _FullLength,
// _ActiveMask, _ShiftBpp and _ShiftRem by symbol name, none of which are
// visible to the compiler as operands.  Without the clobber the compiler is
// free to move the surrounding C stores to those globals across the asm.
2557 static void /* PRIVATE */
2558 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2559 png_bytep prev_row)
2560 {
2561 int bpp;
2562 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2563 int dummy_value_S;
2564 int dummy_value_D;
2566 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2567 _FullLength = row_info->rowbytes; // # of bytes to filter
2569 __asm__ __volatile__ (
2570 // initialize address pointers and offset
2571 #ifdef __PIC__
2572 "pushl %%ebx \n\t" // save index to Global Offset Table
2573 #endif
2574 //pre "movl row, %%edi \n\t" // edi: Avg(x)
2575 "xorl %%ebx, %%ebx \n\t" // ebx: x
2576 "movl %%edi, %%edx \n\t"
2577 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2578 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2579 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2581 "xorl %%eax,%%eax \n\t"
2583 // Compute the Raw value for the first bpp bytes
2584 // Raw(x) = Avg(x) + (Prior(x)/2)
2585 "avg_rlp: \n\t"
2586 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2587 "incl %%ebx \n\t"
2588 "shrb %%al \n\t" // divide by 2
2589 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2590 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2591 "cmpl %%ecx, %%ebx \n\t"
2592 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2593 "jb avg_rlp \n\t" // mov does not affect flags
2595 // get # of bytes to alignment
2596 "movl %%edi, _dif \n\t" // take start of row
2597 "addl %%ebx, _dif \n\t" // add bpp
2598 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2599 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2600 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2601 "jz avg_go \n\t" // alignment
2603 // fix alignment
2604 // Compute the Raw value for the bytes up to the alignment boundary
2605 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2606 "xorl %%ecx, %%ecx \n\t"
2608 "avg_lp1: \n\t"
2609 "xorl %%eax, %%eax \n\t"
2610 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2611 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2612 "addw %%cx, %%ax \n\t"
2613 "incl %%ebx \n\t"
2614 "shrw %%ax \n\t" // divide by 2
2615 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2616 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2617 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2618 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2620 "avg_go: \n\t"
2621 "movl _FullLength, %%eax \n\t"
2622 "movl %%eax, %%ecx \n\t"
2623 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2624 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2625 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2626 "movl %%ecx, _MMXLength \n\t"
2627 #ifdef __PIC__
2628 "popl %%ebx \n\t" // restore index to Global Offset Table
2629 #endif
2631 : "=c" (dummy_value_c), // output regs (dummy)
2632 "=S" (dummy_value_S),
2633 "=D" (dummy_value_D)
2635 : "0" (bpp), // ecx // input regs
2636 "1" (prev_row), // esi
2637 "2" (row) // edi
2639 : "memory", "%eax", "%edx" // clobber list
2640 #ifndef __PIC__
2641 , "%ebx"
2642 #endif
2643 // "memory": the asm reads _FullLength and stores to _dif, _MMXLength
2644 // and the row buffer, none of which are declared as operands
2645 );
2647 // now do the math for the rest of the row
2648 switch (bpp)
2649 {
2650 case 3:
2651 {
2652 _ActiveMask.use = 0x0000000000ffffffLL;
2653 _ShiftBpp.use = 24; // == 3 * 8
2654 _ShiftRem.use = 40; // == 64 - 24
2656 __asm__ __volatile__ (
2657 // re-init address pointers and offset
2658 "movq _ActiveMask, %%mm7 \n\t"
2659 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2660 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2661 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2662 "movq _HBClearMask, %%mm4 \n\t"
2663 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2665 // prime the pump: load the first Raw(x-bpp) data set
2666 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2667 // (correct pos. in loop below)
2668 "avg_3lp: \n\t"
2669 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2670 "movq %%mm5, %%mm3 \n\t"
2671 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp) data
2672 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2673 "movq %%mm7, %%mm6 \n\t"
2674 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2675 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2676 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2677 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2678 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2679 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2680 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2681 // lsb's were == 1 (only valid for active group)
2682 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2683 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2684 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2685 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2686 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2687 // byte
2688 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2689 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 3-5
2690 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2691 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2692 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2693 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2694 // lsb's were == 1 (only valid for active group)
2695 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2696 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2697 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2698 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2699 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2700 // byte
2702 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2703 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last two
2704 // bytes
2705 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2706 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2707 // Data only needs to be shifted once here to
2708 // get the correct x-bpp offset.
2709 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2710 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2711 // lsb's were == 1 (only valid for active group)
2712 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2713 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2714 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2715 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
2716 "addl $8, %%ecx \n\t"
2717 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2718 // byte
2719 // now ready to write back to memory
2720 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2721 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2722 "cmpl _MMXLength, %%ecx \n\t"
2723 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2724 "jb avg_3lp \n\t"
2726 : "=S" (dummy_value_S), // output regs (dummy)
2727 "=D" (dummy_value_D)
2729 : "0" (prev_row), // esi // input regs
2730 "1" (row) // edi
2732 : "memory", "%ecx" // clobber list (asm stores to row, reads globals)
2733 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2734 , "%mm0", "%mm1", "%mm2", "%mm3"
2735 , "%mm4", "%mm5", "%mm6", "%mm7"
2736 #endif
2737 );
2738 }
2739 break; // end 3 bpp
2741 case 6:
2742 case 4:
2743 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2744 //case 5: // GRR BOGUS
2745 {
2746 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2747 // appropriate inactive bytes
2748 _ShiftBpp.use = bpp << 3;
2749 _ShiftRem.use = 64 - _ShiftBpp.use;
2751 __asm__ __volatile__ (
2752 "movq _HBClearMask, %%mm4 \n\t"
2754 // re-init address pointers and offset
2755 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
2757 // load _ActiveMask and clear all bytes except for 1st active group
2758 "movq _ActiveMask, %%mm7 \n\t"
2759 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2760 "psrlq _ShiftRem, %%mm7 \n\t"
2761 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2762 "movq %%mm7, %%mm6 \n\t"
2763 "movq _LBCarryMask, %%mm5 \n\t"
2764 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active group
2766 // prime the pump: load the first Raw(x-bpp) data set
2767 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2768 // (we correct pos. in loop below)
2769 "avg_4lp: \n\t"
2770 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2771 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2772 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2773 // add (Prev_row/2) to average
2774 "movq %%mm5, %%mm3 \n\t"
2775 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2776 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2777 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2778 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2779 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2780 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2781 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2782 // lsb's were == 1 (only valid for active group)
2783 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2784 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2785 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2786 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2787 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2788 // byte
2789 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2790 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2791 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2792 "addl $8, %%ecx \n\t"
2793 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2794 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2795 // lsb's were == 1 (only valid for active group)
2796 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2797 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2798 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2799 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2800 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2801 // byte
2802 "cmpl _MMXLength, %%ecx \n\t"
2803 // now ready to write back to memory
2804 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2805 // prep Raw(x-bpp) for next loop
2806 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2807 "jb avg_4lp \n\t"
2809 : "=S" (dummy_value_S), // output regs (dummy)
2810 "=D" (dummy_value_D)
2812 : "0" (prev_row), // esi // input regs
2813 "1" (row) // edi
2815 : "memory", "%ecx" // clobber list (asm stores to row, reads globals)
2816 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2817 , "%mm0", "%mm1", "%mm2", "%mm3"
2818 , "%mm4", "%mm5", "%mm6", "%mm7"
2819 #endif
2820 );
2821 }
2822 break; // end 4,6 bpp
2824 case 2:
2825 {
2826 _ActiveMask.use = 0x000000000000ffffLL;
2827 _ShiftBpp.use = 16; // == 2 * 8
2828 _ShiftRem.use = 48; // == 64 - 16
2830 __asm__ __volatile__ (
2831 // load _ActiveMask
2832 "movq _ActiveMask, %%mm7 \n\t"
2833 // re-init address pointers and offset
2834 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
2835 "movq _LBCarryMask, %%mm5 \n\t"
2836 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2837 "movq _HBClearMask, %%mm4 \n\t"
2838 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2840 // prime the pump: load the first Raw(x-bpp) data set
2841 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2842 // (we correct pos. in loop below)
2843 "avg_2lp: \n\t"
2844 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2845 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2846 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
2847 // add (Prev_row/2) to average
2848 "movq %%mm5, %%mm3 \n\t"
2849 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2850 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2851 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2852 "movq %%mm7, %%mm6 \n\t"
2853 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2855 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2856 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2857 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2858 // lsb's were == 1 (only valid for active group)
2859 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2860 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2861 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2862 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2863 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2865 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2866 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 2 & 3
2867 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2868 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2869 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2870 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2871 // lsb's were == 1 (only valid for active group)
2872 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2873 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2874 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2875 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2876 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2878 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2879 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 4 & 5
2880 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2881 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2882 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2883 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2884 // lsb's were == 1 (only valid for active group)
2885 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2886 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2887 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2888 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
2889 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2891 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
2892 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 6 & 7
2893 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2894 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2895 "addl $8, %%ecx \n\t"
2896 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2897 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2898 // lsb's were == 1 (only valid for active group)
2899 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2900 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2901 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2902 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4 bytes to add to Avg
2903 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2905 "cmpl _MMXLength, %%ecx \n\t"
2906 // now ready to write back to memory
2907 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2908 // prep Raw(x-bpp) for next loop
2909 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2910 "jb avg_2lp \n\t"
2912 : "=S" (dummy_value_S), // output regs (dummy)
2913 "=D" (dummy_value_D)
2915 : "0" (prev_row), // esi // input regs
2916 "1" (row) // edi
2918 : "memory", "%ecx" // clobber list (asm stores to row, reads globals)
2919 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2920 , "%mm0", "%mm1", "%mm2", "%mm3"
2921 , "%mm4", "%mm5", "%mm6", "%mm7"
2922 #endif
2923 );
2924 }
2925 break; // end 2 bpp
2927 case 1:
2928 {
2929 __asm__ __volatile__ (
2930 // re-init address pointers and offset
2931 #ifdef __PIC__
2932 "pushl %%ebx \n\t" // save Global Offset Table index
2933 #endif
2934 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
2935 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2936 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
2937 "jnb avg_1end \n\t"
2938 // do Avg decode for remaining bytes
2939 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2940 "movl %%edi, %%edx \n\t"
2941 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2942 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2943 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
2944 // in loop below
2945 "avg_1lp: \n\t"
2946 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2947 "xorl %%eax, %%eax \n\t"
2948 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2949 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2950 "addw %%cx, %%ax \n\t"
2951 "incl %%ebx \n\t"
2952 "shrw %%ax \n\t" // divide by 2
2953 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2954 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
2955 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
2956 // mov does not affect flags; -1 to offset inc ebx
2957 "jb avg_1lp \n\t"
2959 "avg_1end: \n\t"
2960 #ifdef __PIC__
2961 "popl %%ebx \n\t" // Global Offset Table index
2962 #endif
2964 : "=c" (dummy_value_c), // output regs (dummy)
2965 "=S" (dummy_value_S),
2966 "=D" (dummy_value_D)
2968 : "0" (bpp), // ecx // input regs
2969 "1" (prev_row), // esi
2970 "2" (row) // edi
2972 : "memory", "%eax", "%edx" // clobber list (asm stores to row)
2973 #ifndef __PIC__
2974 , "%ebx"
2975 #endif
2976 );
2977 }
2978 return; // end 1 bpp: whole row done above, no MMX state to clear
2980 case 8:
2981 {
2982 __asm__ __volatile__ (
2983 // re-init address pointers and offset
2984 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
2985 "movq _LBCarryMask, %%mm5 \n\t" // boundary
2986 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2987 "movq _HBClearMask, %%mm4 \n\t"
2988 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2990 // prime the pump: load the first Raw(x-bpp) data set
2991 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2992 // (NO NEED to correct pos. in loop below)
2994 "avg_8lp: \n\t"
2995 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2996 "movq %%mm5, %%mm3 \n\t"
2997 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2998 "addl $8, %%ecx \n\t"
2999 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3000 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3001 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3002 // where both lsb's were == 1
3003 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3004 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3005 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3006 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3007 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3008 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3009 "cmpl _MMXLength, %%ecx \n\t"
3010 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3011 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3012 "jb avg_8lp \n\t"
3014 : "=S" (dummy_value_S), // output regs (dummy)
3015 "=D" (dummy_value_D)
3017 : "0" (prev_row), // esi // input regs
3018 "1" (row) // edi
3020 : "memory", "%ecx" // clobber list (asm stores to row, reads globals)
3021 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3022 , "%mm0", "%mm1", "%mm2"
3023 , "%mm3", "%mm4", "%mm5"
3024 #endif
3025 );
3026 }
3027 break; // end 8 bpp
3029 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3030 {
3032 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3033 fprintf(stderr,
3034 "libpng: internal logic error (png_read_filter_row_mmx_avg())\n");
3036 #if 0
3037 __asm__ __volatile__ (
3038 "movq _LBCarryMask, %%mm5 \n\t"
3039 // re-init address pointers and offset
3040 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
3041 "movl row, %%edi \n\t" // edi: Avg(x)
3042 "movq _HBClearMask, %%mm4 \n\t"
3043 "movl %%edi, %%edx \n\t"
3044 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3045 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3046 "avg_Alp: \n\t"
3047 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3048 "movq %%mm5, %%mm3 \n\t"
3049 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3050 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3051 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3052 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3053 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both
3054 // lsb's were == 1
3055 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3056 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
3057 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
3058 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3059 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
3060 "addl $8, %%ebx \n\t"
3061 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
3062 "cmpl _MMXLength, %%ebx \n\t"
3063 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3064 "jb avg_Alp \n\t"
3066 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3068 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3070 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3071 );
3072 #endif /* 0 - NEVER REACHED */
3073 }
3074 break;
3076 } // end switch (bpp)
3078 __asm__ __volatile__ (
3079 // MMX acceleration complete; now do clean-up
3080 // check if any remaining bytes left to decode
3081 #ifdef __PIC__
3082 "pushl %%ebx \n\t" // save index to Global Offset Table
3083 #endif
3084 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3085 //pre "movl row, %%edi \n\t" // edi: Avg(x)
3086 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3087 "jnb avg_end \n\t"
3089 // do Avg decode for remaining bytes
3090 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3091 "movl %%edi, %%edx \n\t"
3092 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3093 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3094 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3096 "avg_lp2: \n\t"
3097 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3098 "xorl %%eax, %%eax \n\t"
3099 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3100 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3101 "addw %%cx, %%ax \n\t"
3102 "incl %%ebx \n\t"
3103 "shrw %%ax \n\t" // divide by 2
3104 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3105 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3106 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3107 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3109 "avg_end: \n\t"
3110 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3111 #ifdef __PIC__
3112 "popl %%ebx \n\t" // restore index to Global Offset Table
3113 #endif
3115 : "=c" (dummy_value_c), // output regs (dummy)
3116 "=S" (dummy_value_S),
3117 "=D" (dummy_value_D)
3119 : "0" (bpp), // ecx // input regs
3120 "1" (prev_row), // esi
3121 "2" (row) // edi
3123 : "memory", "%eax", "%edx" // clobber list (asm stores to row)
3124 #ifndef __PIC__
3125 , "%ebx"
3126 #endif
3127 );
3129 } /* end png_read_filter_row_mmx_avg() */
3134 //===========================================================================//
3135 // //
3136 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3137 // //
3138 //===========================================================================//
3140 // Optimized code for PNG Paeth filter decoder
3142 static void /* PRIVATE */
3143 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3144 png_bytep prev_row)
3145 {
3146 int bpp;
3147 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3148 int dummy_value_S;
3149 int dummy_value_D;
3151 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3152 _FullLength = row_info->rowbytes; // # of bytes to filter
3154 __asm__ __volatile__ (
3155 #ifdef __PIC__
3156 "pushl %%ebx \n\t" // save index to Global Offset Table
3157 #endif
3158 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3159 //pre "movl row, %%edi \n\t"
3160 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3161 //pre "movl prev_row, %%esi \n\t"
3162 "xorl %%eax, %%eax \n\t"
3164 // Compute the Raw value for the first bpp bytes
3165 // Note: for x < bpp the formula always works out to
3166 // Raw(x) = Paeth(x) + Prior(x)
3167 "paeth_rlp: \n\t"
3168 "movb (%%edi,%%ebx,), %%al \n\t"
3169 "addb (%%esi,%%ebx,), %%al \n\t"
3170 "incl %%ebx \n\t"
3171 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3172 "cmpl %%ecx, %%ebx \n\t"
3173 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3174 "jb paeth_rlp \n\t"
3175 // get # of bytes to alignment
3176 "movl %%edi, _dif \n\t" // take start of row
3177 "addl %%ebx, _dif \n\t" // add bpp
3178 "xorl %%ecx, %%ecx \n\t"
3179 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment boundary
3180 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3181 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx at alignment
3182 "jz paeth_go \n\t"
3183 // fix alignment
3185 "paeth_lp1: \n\t"
3186 "xorl %%eax, %%eax \n\t"
3187 // pav = p - a = (a + b - c) - a = b - c
3188 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3189 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3190 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3191 "movl %%eax, _patemp \n\t" // Save pav for later use
3192 "xorl %%eax, %%eax \n\t"
3193 // pbv = p - b = (a + b - c) - b = a - c
3194 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3195 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3196 "movl %%eax, %%ecx \n\t"
3197 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3198 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3199 // pc = abs(pcv)
3200 "testl $0x80000000, %%eax \n\t"
3201 "jz paeth_pca \n\t"
3202 "negl %%eax \n\t" // reverse sign of neg values
3204 "paeth_pca: \n\t"
3205 "movl %%eax, _pctemp \n\t" // save pc for later use
3206 // pb = abs(pbv)
3207 "testl $0x80000000, %%ecx \n\t"
3208 "jz paeth_pba \n\t"
3209 "negl %%ecx \n\t" // reverse sign of neg values
3211 "paeth_pba: \n\t"
3212 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3213 // pa = abs(pav)
3214 "movl _patemp, %%eax \n\t"
3215 "testl $0x80000000, %%eax \n\t"
3216 "jz paeth_paa \n\t"
3217 "negl %%eax \n\t" // reverse sign of neg values
3219 "paeth_paa: \n\t"
3220 "movl %%eax, _patemp \n\t" // save pa for later use
3221 // test if pa <= pb
3222 "cmpl %%ecx, %%eax \n\t"
3223 "jna paeth_abb \n\t"
3224 // pa > pb; now test if pb <= pc
3225 "cmpl _pctemp, %%ecx \n\t"
3226 "jna paeth_bbc \n\t"
3227 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3229 "jmp paeth_paeth \n\t"
3231 "paeth_bbc: \n\t"
3232 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3233 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3234 "jmp paeth_paeth \n\t"
3236 "paeth_abb: \n\t"
3237 // pa <= pb; now test if pa <= pc
3238 "cmpl _pctemp, %%eax \n\t"
3239 "jna paeth_abc \n\t"
3240 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3241 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3242 "jmp paeth_paeth \n\t"
3244 "paeth_abc: \n\t"
3245 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3246 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3248 "paeth_paeth: \n\t"
3249 "incl %%ebx \n\t"
3250 "incl %%edx \n\t"
3251 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3252 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3253 "cmpl _dif, %%ebx \n\t"
3254 "jb paeth_lp1 \n\t"
3256 "paeth_go: \n\t"
3257 "movl _FullLength, %%ecx \n\t"
3258 "movl %%ecx, %%eax \n\t"
3259 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3260 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3261 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3262 "movl %%ecx, _MMXLength \n\t"
3263 #ifdef __PIC__
3264 "popl %%ebx \n\t" // restore index to Global Offset Table
3265 #endif
3267 : "=c" (dummy_value_c), // output regs (dummy)
3268 "=S" (dummy_value_S),
3269 "=D" (dummy_value_D)
3271 : "0" (bpp), // ecx // input regs
3272 "1" (prev_row), // esi
3273 "2" (row) // edi
3275 : "%eax", "%edx" // clobber list
3276 #ifndef __PIC__
3277 , "%ebx"
3278 #endif
3279 );
3281 // now do the math for the rest of the row
3282 switch (bpp)
3283 {
3284 case 3:
3285 {
3286 _ActiveMask.use = 0x0000000000ffffffLL;
3287 _ActiveMaskEnd.use = 0xffff000000000000LL;
3288 _ShiftBpp.use = 24; // == bpp(3) * 8
3289 _ShiftRem.use = 40; // == 64 - 24
3291 __asm__ __volatile__ (
3292 "movl _dif, %%ecx \n\t"
3293 // preload "movl row, %%edi \n\t"
3294 // preload "movl prev_row, %%esi \n\t"
3295 "pxor %%mm0, %%mm0 \n\t"
3296 // prime the pump: load the first Raw(x-bpp) data set
3297 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3298 "paeth_3lp: \n\t"
3299 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st 3 bytes
3300 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3301 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3302 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3303 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3304 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st 3 bytes
3305 // pav = p - a = (a + b - c) - a = b - c
3306 "movq %%mm2, %%mm4 \n\t"
3307 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3308 // pbv = p - b = (a + b - c) - b = a - c
3309 "movq %%mm1, %%mm5 \n\t"
3310 "psubw %%mm3, %%mm4 \n\t"
3311 "pxor %%mm7, %%mm7 \n\t"
3312 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3313 "movq %%mm4, %%mm6 \n\t"
3314 "psubw %%mm3, %%mm5 \n\t"
3316 // pa = abs(p-a) = abs(pav)
3317 // pb = abs(p-b) = abs(pbv)
3318 // pc = abs(p-c) = abs(pcv)
3319 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3320 "paddw %%mm5, %%mm6 \n\t"
3321 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3322 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3323 "psubw %%mm0, %%mm4 \n\t"
3324 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3325 "psubw %%mm0, %%mm4 \n\t"
3326 "psubw %%mm7, %%mm5 \n\t"
3327 "pxor %%mm0, %%mm0 \n\t"
3328 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3329 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3330 "psubw %%mm7, %%mm5 \n\t"
3331 "psubw %%mm0, %%mm6 \n\t"
3332 // test pa <= pb
3333 "movq %%mm4, %%mm7 \n\t"
3334 "psubw %%mm0, %%mm6 \n\t"
3335 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3336 "movq %%mm7, %%mm0 \n\t"
3337 // use mm7 mask to merge pa & pb
3338 "pand %%mm7, %%mm5 \n\t"
3339 // use mm0 mask copy to merge a & b
3340 "pand %%mm0, %%mm2 \n\t"
3341 "pandn %%mm4, %%mm7 \n\t"
3342 "pandn %%mm1, %%mm0 \n\t"
3343 "paddw %%mm5, %%mm7 \n\t"
3344 "paddw %%mm2, %%mm0 \n\t"
3345 // test ((pa <= pb)? pa:pb) <= pc
3346 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3347 "pxor %%mm1, %%mm1 \n\t"
3348 "pand %%mm7, %%mm3 \n\t"
3349 "pandn %%mm0, %%mm7 \n\t"
3350 "paddw %%mm3, %%mm7 \n\t"
3351 "pxor %%mm0, %%mm0 \n\t"
3352 "packuswb %%mm1, %%mm7 \n\t"
3353 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3354 "pand _ActiveMask, %%mm7 \n\t"
3355 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3356 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3357 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3358 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3359 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
3360 // now do Paeth for 2nd set of bytes (3-5)
3361 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3362 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3363 "pxor %%mm7, %%mm7 \n\t"
3364 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3365 // pbv = p - b = (a + b - c) - b = a - c
3366 "movq %%mm1, %%mm5 \n\t"
3367 // pav = p - a = (a + b - c) - a = b - c
3368 "movq %%mm2, %%mm4 \n\t"
3369 "psubw %%mm3, %%mm5 \n\t"
3370 "psubw %%mm3, %%mm4 \n\t"
3371 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3372 // pav + pbv = pbv + pav
3373 "movq %%mm5, %%mm6 \n\t"
3374 "paddw %%mm4, %%mm6 \n\t"
3376 // pa = abs(p-a) = abs(pav)
3377 // pb = abs(p-b) = abs(pbv)
3378 // pc = abs(p-c) = abs(pcv)
3379 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3380 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3381 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3382 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3383 "psubw %%mm0, %%mm5 \n\t"
3384 "psubw %%mm7, %%mm4 \n\t"
3385 "psubw %%mm0, %%mm5 \n\t"
3386 "psubw %%mm7, %%mm4 \n\t"
3387 "pxor %%mm0, %%mm0 \n\t"
3388 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3389 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3390 "psubw %%mm0, %%mm6 \n\t"
3391 // test pa <= pb
3392 "movq %%mm4, %%mm7 \n\t"
3393 "psubw %%mm0, %%mm6 \n\t"
3394 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3395 "movq %%mm7, %%mm0 \n\t"
3396 // use mm7 mask to merge pa & pb
3397 "pand %%mm7, %%mm5 \n\t"
3398 // use mm0 mask copy to merge a & b
3399 "pand %%mm0, %%mm2 \n\t"
3400 "pandn %%mm4, %%mm7 \n\t"
3401 "pandn %%mm1, %%mm0 \n\t"
3402 "paddw %%mm5, %%mm7 \n\t"
3403 "paddw %%mm2, %%mm0 \n\t"
3404 // test ((pa <= pb)? pa:pb) <= pc
3405 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3406 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3407 "pand %%mm7, %%mm3 \n\t"
3408 "pandn %%mm0, %%mm7 \n\t"
3409 "pxor %%mm1, %%mm1 \n\t"
3410 "paddw %%mm3, %%mm7 \n\t"
3411 "pxor %%mm0, %%mm0 \n\t"
3412 "packuswb %%mm1, %%mm7 \n\t"
3413 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3414 "pand _ActiveMask, %%mm7 \n\t"
3415 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3416 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of 3 bytes
3417 // pav = p - a = (a + b - c) - a = b - c
3418 "movq %%mm2, %%mm4 \n\t"
3419 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3420 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3421 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3422 "movq %%mm7, %%mm1 \n\t"
3423 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3424 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3425 // now mm1 will be used as Raw(x-bpp)
3426 // now do Paeth for 3rd, and final, set of bytes (6-7)
3427 "pxor %%mm7, %%mm7 \n\t"
3428 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3429 "psubw %%mm3, %%mm4 \n\t"
3430 // pbv = p - b = (a + b - c) - b = a - c
3431 "movq %%mm1, %%mm5 \n\t"
3432 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3433 "movq %%mm4, %%mm6 \n\t"
3434 "psubw %%mm3, %%mm5 \n\t"
3435 "pxor %%mm0, %%mm0 \n\t"
3436 "paddw %%mm5, %%mm6 \n\t"
3438 // pa = abs(p-a) = abs(pav)
3439 // pb = abs(p-b) = abs(pbv)
3440 // pc = abs(p-c) = abs(pcv)
3441 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3442 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3443 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3444 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3445 "psubw %%mm0, %%mm4 \n\t"
3446 "psubw %%mm7, %%mm5 \n\t"
3447 "psubw %%mm0, %%mm4 \n\t"
3448 "psubw %%mm7, %%mm5 \n\t"
3449 "pxor %%mm0, %%mm0 \n\t"
3450 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3451 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3452 "psubw %%mm0, %%mm6 \n\t"
3453 // test pa <= pb
3454 "movq %%mm4, %%mm7 \n\t"
3455 "psubw %%mm0, %%mm6 \n\t"
3456 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3457 "movq %%mm7, %%mm0 \n\t"
3458 // use mm0 mask copy to merge a & b
3459 "pand %%mm0, %%mm2 \n\t"
3460 // use mm7 mask to merge pa & pb
3461 "pand %%mm7, %%mm5 \n\t"
3462 "pandn %%mm1, %%mm0 \n\t"
3463 "pandn %%mm4, %%mm7 \n\t"
3464 "paddw %%mm2, %%mm0 \n\t"
3465 "paddw %%mm5, %%mm7 \n\t"
3466 // test ((pa <= pb)? pa:pb) <= pc
3467 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3468 "pand %%mm7, %%mm3 \n\t"
3469 "pandn %%mm0, %%mm7 \n\t"
3470 "paddw %%mm3, %%mm7 \n\t"
3471 "pxor %%mm1, %%mm1 \n\t"
3472 "packuswb %%mm7, %%mm1 \n\t"
3473 // step ecx to next set of 8 bytes and repeat loop til done
3474 "addl $8, %%ecx \n\t"
3475 "pand _ActiveMaskEnd, %%mm1 \n\t"
3476 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3478 "cmpl _MMXLength, %%ecx \n\t"
3479 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3480 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3481 // mm1 will be used as Raw(x-bpp) next loop
3482 // mm3 is reloaded with Prior(x-bpp) at the top of the loop
3483 "jb paeth_3lp \n\t"
3485 : "=S" (dummy_value_S), // output regs (dummy)
3486 "=D" (dummy_value_D)
3488 : "0" (prev_row), // esi // input regs
3489 "1" (row) // edi
3491 : "%ecx" // clobber list
3492 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3493 , "%mm0", "%mm1", "%mm2", "%mm3"
3494 , "%mm4", "%mm5", "%mm6", "%mm7"
3495 #endif
3496 );
3497 }
3498 break; // end 3 bpp
3500 case 6:
3501 //case 7: // GRR BOGUS
3502 //case 5: // GRR BOGUS
3503 {
3504 _ActiveMask.use = 0x00000000ffffffffLL;
3505 _ActiveMask2.use = 0xffffffff00000000LL;
3506 _ShiftBpp.use = bpp << 3; // == bpp * 8
3507 _ShiftRem.use = 64 - _ShiftBpp.use;
3509 __asm__ __volatile__ (
3510 "movl _dif, %%ecx \n\t"
3511 // preload "movl row, %%edi \n\t"
3512 // preload "movl prev_row, %%esi \n\t"
3513 // prime the pump: load the first Raw(x-bpp) data set
3514 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3515 "pxor %%mm0, %%mm0 \n\t"
3517 "paeth_6lp: \n\t"
3518 // must shift to position Raw(x-bpp) data
3519 "psrlq _ShiftRem, %%mm1 \n\t"
3520 // do first set of 4 bytes
3521 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3522 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3523 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3524 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3525 // must shift to position Prior(x-bpp) data
3526 "psrlq _ShiftRem, %%mm3 \n\t"
3527 // pav = p - a = (a + b - c) - a = b - c
3528 "movq %%mm2, %%mm4 \n\t"
3529 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3530 // pbv = p - b = (a + b - c) - b = a - c
3531 "movq %%mm1, %%mm5 \n\t"
3532 "psubw %%mm3, %%mm4 \n\t"
3533 "pxor %%mm7, %%mm7 \n\t"
3534 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3535 "movq %%mm4, %%mm6 \n\t"
3536 "psubw %%mm3, %%mm5 \n\t"
3537 // pa = abs(p-a) = abs(pav)
3538 // pb = abs(p-b) = abs(pbv)
3539 // pc = abs(p-c) = abs(pcv)
3540 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3541 "paddw %%mm5, %%mm6 \n\t"
3542 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3543 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3544 "psubw %%mm0, %%mm4 \n\t"
3545 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3546 "psubw %%mm0, %%mm4 \n\t"
3547 "psubw %%mm7, %%mm5 \n\t"
3548 "pxor %%mm0, %%mm0 \n\t"
3549 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3550 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3551 "psubw %%mm7, %%mm5 \n\t"
3552 "psubw %%mm0, %%mm6 \n\t"
3553 // test pa <= pb
3554 "movq %%mm4, %%mm7 \n\t"
3555 "psubw %%mm0, %%mm6 \n\t"
3556 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3557 "movq %%mm7, %%mm0 \n\t"
3558 // use mm7 mask to merge pa & pb
3559 "pand %%mm7, %%mm5 \n\t"
3560 // use mm0 mask copy to merge a & b
3561 "pand %%mm0, %%mm2 \n\t"
3562 "pandn %%mm4, %%mm7 \n\t"
3563 "pandn %%mm1, %%mm0 \n\t"
3564 "paddw %%mm5, %%mm7 \n\t"
3565 "paddw %%mm2, %%mm0 \n\t"
3566 // test ((pa <= pb)? pa:pb) <= pc
3567 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3568 "pxor %%mm1, %%mm1 \n\t"
3569 "pand %%mm7, %%mm3 \n\t"
3570 "pandn %%mm0, %%mm7 \n\t"
3571 "paddw %%mm3, %%mm7 \n\t"
3572 "pxor %%mm0, %%mm0 \n\t"
3573 "packuswb %%mm1, %%mm7 \n\t"
3574 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3575 "pand _ActiveMask, %%mm7 \n\t"
3576 "psrlq _ShiftRem, %%mm3 \n\t"
3577 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3578 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3579 "movq %%mm2, %%mm6 \n\t"
3580 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3581 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3582 "psllq _ShiftBpp, %%mm6 \n\t"
3583 "movq %%mm7, %%mm5 \n\t"
3584 "psrlq _ShiftRem, %%mm1 \n\t"
3585 "por %%mm6, %%mm3 \n\t"
3586 "psllq _ShiftBpp, %%mm5 \n\t"
3587 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3588 "por %%mm5, %%mm1 \n\t"
3589 // do second set of 4 bytes
3590 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3591 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3592 // pav = p - a = (a + b - c) - a = b - c
3593 "movq %%mm2, %%mm4 \n\t"
3594 // pbv = p - b = (a + b - c) - b = a - c
3595 "movq %%mm1, %%mm5 \n\t"
3596 "psubw %%mm3, %%mm4 \n\t"
3597 "pxor %%mm7, %%mm7 \n\t"
3598 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3599 "movq %%mm4, %%mm6 \n\t"
3600 "psubw %%mm3, %%mm5 \n\t"
3601 // pa = abs(p-a) = abs(pav)
3602 // pb = abs(p-b) = abs(pbv)
3603 // pc = abs(p-c) = abs(pcv)
3604 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3605 "paddw %%mm5, %%mm6 \n\t"
3606 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3607 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3608 "psubw %%mm0, %%mm4 \n\t"
3609 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3610 "psubw %%mm0, %%mm4 \n\t"
3611 "psubw %%mm7, %%mm5 \n\t"
3612 "pxor %%mm0, %%mm0 \n\t"
3613 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3614 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3615 "psubw %%mm7, %%mm5 \n\t"
3616 "psubw %%mm0, %%mm6 \n\t"
3617 // test pa <= pb
3618 "movq %%mm4, %%mm7 \n\t"
3619 "psubw %%mm0, %%mm6 \n\t"
3620 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3621 "movq %%mm7, %%mm0 \n\t"
3622 // use mm7 mask to merge pa & pb
3623 "pand %%mm7, %%mm5 \n\t"
3624 // use mm0 mask copy to merge a & b
3625 "pand %%mm0, %%mm2 \n\t"
3626 "pandn %%mm4, %%mm7 \n\t"
3627 "pandn %%mm1, %%mm0 \n\t"
3628 "paddw %%mm5, %%mm7 \n\t"
3629 "paddw %%mm2, %%mm0 \n\t"
3630 // test ((pa <= pb)? pa:pb) <= pc
3631 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3632 "pxor %%mm1, %%mm1 \n\t"
3633 "pand %%mm7, %%mm3 \n\t"
3634 "pandn %%mm0, %%mm7 \n\t"
3635 "pxor %%mm1, %%mm1 \n\t"
3636 "paddw %%mm3, %%mm7 \n\t"
3637 "pxor %%mm0, %%mm0 \n\t"
3638 // step ecx to next set of 8 bytes and repeat loop til done
3639 "addl $8, %%ecx \n\t"
3640 "packuswb %%mm7, %%mm1 \n\t"
3641 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3642 "cmpl _MMXLength, %%ecx \n\t"
3643 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3644 // mm1 will be used as Raw(x-bpp) next loop
3645 "jb paeth_6lp \n\t"
3647 : "=S" (dummy_value_S), // output regs (dummy)
3648 "=D" (dummy_value_D)
3650 : "0" (prev_row), // esi // input regs
3651 "1" (row) // edi
3653 : "%ecx" // clobber list
3654 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3655 , "%mm0", "%mm1", "%mm2", "%mm3"
3656 , "%mm4", "%mm5", "%mm6", "%mm7"
3657 #endif
3658 );
3659 }
3660 break; // end 6 bpp
3662 case 4:
3663 {
3664 _ActiveMask.use = 0x00000000ffffffffLL;
3666 __asm__ __volatile__ (
3667 "movl _dif, %%ecx \n\t"
3668 // preload "movl row, %%edi \n\t"
3669 // preload "movl prev_row, %%esi \n\t"
3670 "pxor %%mm0, %%mm0 \n\t"
3671 // prime the pump: load the first Raw(x-bpp) data set
3672 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3673 // a=Raw(x-bpp) bytes
3674 "paeth_4lp: \n\t"
3675 // do first set of 4 bytes
3676 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3677 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3678 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3679 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3680 // pav = p - a = (a + b - c) - a = b - c
3681 "movq %%mm2, %%mm4 \n\t"
3682 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3683 // pbv = p - b = (a + b - c) - b = a - c
3684 "movq %%mm1, %%mm5 \n\t"
3685 "psubw %%mm3, %%mm4 \n\t"
3686 "pxor %%mm7, %%mm7 \n\t"
3687 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3688 "movq %%mm4, %%mm6 \n\t"
3689 "psubw %%mm3, %%mm5 \n\t"
3690 // pa = abs(p-a) = abs(pav)
3691 // pb = abs(p-b) = abs(pbv)
3692 // pc = abs(p-c) = abs(pcv)
3693 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3694 "paddw %%mm5, %%mm6 \n\t"
3695 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3696 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3697 "psubw %%mm0, %%mm4 \n\t"
3698 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3699 "psubw %%mm0, %%mm4 \n\t"
3700 "psubw %%mm7, %%mm5 \n\t"
3701 "pxor %%mm0, %%mm0 \n\t"
3702 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3704 "psubw %%mm7, %%mm5 \n\t"
3705 "psubw %%mm0, %%mm6 \n\t"
3706 // test pa <= pb
3707 "movq %%mm4, %%mm7 \n\t"
3708 "psubw %%mm0, %%mm6 \n\t"
3709 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3710 "movq %%mm7, %%mm0 \n\t"
3711 // use mm7 mask to merge pa & pb
3712 "pand %%mm7, %%mm5 \n\t"
3713 // use mm0 mask copy to merge a & b
3714 "pand %%mm0, %%mm2 \n\t"
3715 "pandn %%mm4, %%mm7 \n\t"
3716 "pandn %%mm1, %%mm0 \n\t"
3717 "paddw %%mm5, %%mm7 \n\t"
3718 "paddw %%mm2, %%mm0 \n\t"
3719 // test ((pa <= pb)? pa:pb) <= pc
3720 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3721 "pxor %%mm1, %%mm1 \n\t"
3722 "pand %%mm7, %%mm3 \n\t"
3723 "pandn %%mm0, %%mm7 \n\t"
3724 "paddw %%mm3, %%mm7 \n\t"
3725 "pxor %%mm0, %%mm0 \n\t"
3726 "packuswb %%mm1, %%mm7 \n\t"
3727 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3728 "pand _ActiveMask, %%mm7 \n\t"
3729 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3730 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3731 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3732 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3733 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
3734 // do second set of 4 bytes
3735 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3736 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3737 // pav = p - a = (a + b - c) - a = b - c
3738 "movq %%mm2, %%mm4 \n\t"
3739 // pbv = p - b = (a + b - c) - b = a - c
3740 "movq %%mm1, %%mm5 \n\t"
3741 "psubw %%mm3, %%mm4 \n\t"
3742 "pxor %%mm7, %%mm7 \n\t"
3743 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3744 "movq %%mm4, %%mm6 \n\t"
3745 "psubw %%mm3, %%mm5 \n\t"
3746 // pa = abs(p-a) = abs(pav)
3747 // pb = abs(p-b) = abs(pbv)
3748 // pc = abs(p-c) = abs(pcv)
3749 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3750 "paddw %%mm5, %%mm6 \n\t"
3751 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3752 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3753 "psubw %%mm0, %%mm4 \n\t"
3754 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3755 "psubw %%mm0, %%mm4 \n\t"
3756 "psubw %%mm7, %%mm5 \n\t"
3757 "pxor %%mm0, %%mm0 \n\t"
3758 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3759 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3760 "psubw %%mm7, %%mm5 \n\t"
3761 "psubw %%mm0, %%mm6 \n\t"
3762 // test pa <= pb
3763 "movq %%mm4, %%mm7 \n\t"
3764 "psubw %%mm0, %%mm6 \n\t"
3765 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3766 "movq %%mm7, %%mm0 \n\t"
3767 // use mm7 mask to merge pa & pb
3768 "pand %%mm7, %%mm5 \n\t"
3769 // use mm0 mask copy to merge a & b
3770 "pand %%mm0, %%mm2 \n\t"
3771 "pandn %%mm4, %%mm7 \n\t"
3772 "pandn %%mm1, %%mm0 \n\t"
3773 "paddw %%mm5, %%mm7 \n\t"
3774 "paddw %%mm2, %%mm0 \n\t"
3775 // test ((pa <= pb)? pa:pb) <= pc
3776 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3777 "pxor %%mm1, %%mm1 \n\t"
3778 "pand %%mm7, %%mm3 \n\t"
3779 "pandn %%mm0, %%mm7 \n\t"
3780 "pxor %%mm1, %%mm1 \n\t"
3781 "paddw %%mm3, %%mm7 \n\t"
3782 "pxor %%mm0, %%mm0 \n\t"
3783 // step ecx to next set of 8 bytes and repeat loop til done
3784 "addl $8, %%ecx \n\t"
3785 "packuswb %%mm7, %%mm1 \n\t"
3786 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
3787 "cmpl _MMXLength, %%ecx \n\t"
3788 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3789 // mm1 will be used as Raw(x-bpp) next loop
3790 "jb paeth_4lp \n\t"
3792 : "=S" (dummy_value_S), // output regs (dummy)
3793 "=D" (dummy_value_D)
3795 : "0" (prev_row), // esi // input regs
3796 "1" (row) // edi
3798 : "%ecx" // clobber list
3799 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3800 , "%mm0", "%mm1", "%mm2", "%mm3"
3801 , "%mm4", "%mm5", "%mm6", "%mm7"
3802 #endif
3803 );
3804 }
3805 break; // end 4 bpp
3807 case 8: // bpp == 8
3808 {
3809 _ActiveMask.use = 0x00000000ffffffffLL;
3811 __asm__ __volatile__ (
3812 "movl _dif, %%ecx \n\t"
3813 // preload "movl row, %%edi \n\t"
3814 // preload "movl prev_row, %%esi \n\t"
3815 "pxor %%mm0, %%mm0 \n\t"
3816 // prime the pump: load the first Raw(x-bpp) data set
3817 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3818 // a=Raw(x-bpp) bytes
3819 "paeth_8lp: \n\t"
3820 // do first set of 4 bytes
3821 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3822 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3823 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3824 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3825 // pav = p - a = (a + b - c) - a = b - c
3826 "movq %%mm2, %%mm4 \n\t"
3827 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3828 // pbv = p - b = (a + b - c) - b = a - c
3829 "movq %%mm1, %%mm5 \n\t"
3830 "psubw %%mm3, %%mm4 \n\t"
3831 "pxor %%mm7, %%mm7 \n\t"
3832 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3833 "movq %%mm4, %%mm6 \n\t"
3834 "psubw %%mm3, %%mm5 \n\t"
3835 // pa = abs(p-a) = abs(pav)
3836 // pb = abs(p-b) = abs(pbv)
3837 // pc = abs(p-c) = abs(pcv)
3838 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3839 "paddw %%mm5, %%mm6 \n\t"
3840 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3841 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3842 "psubw %%mm0, %%mm4 \n\t"
3843 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3844 "psubw %%mm0, %%mm4 \n\t"
3845 "psubw %%mm7, %%mm5 \n\t"
3846 "pxor %%mm0, %%mm0 \n\t"
3847 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3848 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3849 "psubw %%mm7, %%mm5 \n\t"
3850 "psubw %%mm0, %%mm6 \n\t"
3851 // test pa <= pb
3852 "movq %%mm4, %%mm7 \n\t"
3853 "psubw %%mm0, %%mm6 \n\t"
3854 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3855 "movq %%mm7, %%mm0 \n\t"
3856 // use mm7 mask to merge pa & pb
3857 "pand %%mm7, %%mm5 \n\t"
3858 // use mm0 mask copy to merge a & b
3859 "pand %%mm0, %%mm2 \n\t"
3860 "pandn %%mm4, %%mm7 \n\t"
3861 "pandn %%mm1, %%mm0 \n\t"
3862 "paddw %%mm5, %%mm7 \n\t"
3863 "paddw %%mm2, %%mm0 \n\t"
3864 // test ((pa <= pb)? pa:pb) <= pc
3865 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3866 "pxor %%mm1, %%mm1 \n\t"
3867 "pand %%mm7, %%mm3 \n\t"
3868 "pandn %%mm0, %%mm7 \n\t"
3869 "paddw %%mm3, %%mm7 \n\t"
3870 "pxor %%mm0, %%mm0 \n\t"
3871 "packuswb %%mm1, %%mm7 \n\t"
3872 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3873 "pand _ActiveMask, %%mm7 \n\t"
3874 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3875 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3876 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3877 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3878 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
3880 // do second set of 4 bytes
3881 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3882 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3883 // pav = p - a = (a + b - c) - a = b - c
3884 "movq %%mm2, %%mm4 \n\t"
3885 // pbv = p - b = (a + b - c) - b = a - c
3886 "movq %%mm1, %%mm5 \n\t"
3887 "psubw %%mm3, %%mm4 \n\t"
3888 "pxor %%mm7, %%mm7 \n\t"
3889 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3890 "movq %%mm4, %%mm6 \n\t"
3891 "psubw %%mm3, %%mm5 \n\t"
3892 // pa = abs(p-a) = abs(pav)
3893 // pb = abs(p-b) = abs(pbv)
3894 // pc = abs(p-c) = abs(pcv)
3895 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3896 "paddw %%mm5, %%mm6 \n\t"
3897 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3898 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3899 "psubw %%mm0, %%mm4 \n\t"
3900 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3901 "psubw %%mm0, %%mm4 \n\t"
3902 "psubw %%mm7, %%mm5 \n\t"
3903 "pxor %%mm0, %%mm0 \n\t"
3904 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3905 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3906 "psubw %%mm7, %%mm5 \n\t"
3907 "psubw %%mm0, %%mm6 \n\t"
3908 // test pa <= pb
3909 "movq %%mm4, %%mm7 \n\t"
3910 "psubw %%mm0, %%mm6 \n\t"
3911 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3912 "movq %%mm7, %%mm0 \n\t"
3913 // use mm7 mask to merge pa & pb
3914 "pand %%mm7, %%mm5 \n\t"
3915 // use mm0 mask copy to merge a & b
3916 "pand %%mm0, %%mm2 \n\t"
3917 "pandn %%mm4, %%mm7 \n\t"
3918 "pandn %%mm1, %%mm0 \n\t"
3919 "paddw %%mm5, %%mm7 \n\t"
3920 "paddw %%mm2, %%mm0 \n\t"
3921 // test ((pa <= pb)? pa:pb) <= pc
3922 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3923 "pxor %%mm1, %%mm1 \n\t"
3924 "pand %%mm7, %%mm3 \n\t"
3925 "pandn %%mm0, %%mm7 \n\t"
3926 "pxor %%mm1, %%mm1 \n\t"
3927 "paddw %%mm3, %%mm7 \n\t"
3928 "pxor %%mm0, %%mm0 \n\t"
3929 // step ecx to next set of 8 bytes and repeat loop til done
3930 "addl $8, %%ecx \n\t"
3931 "packuswb %%mm7, %%mm1 \n\t"
3932 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3933 "cmpl _MMXLength, %%ecx \n\t"
3934 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3935 // mm1 will be used as Raw(x-bpp) next loop
3936 "jb paeth_8lp \n\t"
3938 : "=S" (dummy_value_S), // output regs (dummy)
3939 "=D" (dummy_value_D)
3941 : "0" (prev_row), // esi // input regs
3942 "1" (row) // edi
3944 : "%ecx" // clobber list
3945 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3946 , "%mm0", "%mm1", "%mm2", "%mm3"
3947 , "%mm4", "%mm5", "%mm6", "%mm7"
3948 #endif
3949 );
3950 }
3951 break; // end 8 bpp
3953 case 1: // bpp = 1
3954 case 2: // bpp = 2
3955 default: // bpp > 8
3956 {
3957 __asm__ __volatile__ (
3958 #ifdef __PIC__
3959 "pushl %%ebx \n\t" // save Global Offset Table index
3960 #endif
3961 "movl _dif, %%ebx \n\t"
3962 "cmpl _FullLength, %%ebx \n\t"
3963 "jnb paeth_dend \n\t"
3965 // preload "movl row, %%edi \n\t"
3966 // preload "movl prev_row, %%esi \n\t"
3967 // do Paeth decode for remaining bytes
3968 "movl %%ebx, %%edx \n\t"
3969 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3970 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
3971 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3973 "paeth_dlp: \n\t"
3974 "xorl %%eax, %%eax \n\t"
3975 // pav = p - a = (a + b - c) - a = b - c
3976 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3977 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3978 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3979 "movl %%eax, _patemp \n\t" // Save pav for later use
3980 "xorl %%eax, %%eax \n\t"
3981 // pbv = p - b = (a + b - c) - b = a - c
3982 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3983 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3984 "movl %%eax, %%ecx \n\t"
3985 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3986 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3987 // pc = abs(pcv)
3988 "testl $0x80000000, %%eax \n\t"
3989 "jz paeth_dpca \n\t"
3990 "negl %%eax \n\t" // reverse sign of neg values
3992 "paeth_dpca: \n\t"
3993 "movl %%eax, _pctemp \n\t" // save pc for later use
3994 // pb = abs(pbv)
3995 "testl $0x80000000, %%ecx \n\t"
3996 "jz paeth_dpba \n\t"
3997 "negl %%ecx \n\t" // reverse sign of neg values
3999 "paeth_dpba: \n\t"
4000 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4001 // pa = abs(pav)
4002 "movl _patemp, %%eax \n\t"
4003 "testl $0x80000000, %%eax \n\t"
4004 "jz paeth_dpaa \n\t"
4005 "negl %%eax \n\t" // reverse sign of neg values
4007 "paeth_dpaa: \n\t"
4008 "movl %%eax, _patemp \n\t" // save pa for later use
4009 // test if pa <= pb
4010 "cmpl %%ecx, %%eax \n\t"
4011 "jna paeth_dabb \n\t"
4012 // pa > pb; now test if pb <= pc
4013 "cmpl _pctemp, %%ecx \n\t"
4014 "jna paeth_dbbc \n\t"
4015 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4016 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4017 "jmp paeth_dpaeth \n\t"
4019 "paeth_dbbc: \n\t"
4020 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4021 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4022 "jmp paeth_dpaeth \n\t"
4024 "paeth_dabb: \n\t"
4025 // pa <= pb; now test if pa <= pc
4026 "cmpl _pctemp, %%eax \n\t"
4027 "jna paeth_dabc \n\t"
4028 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4029 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4030 "jmp paeth_dpaeth \n\t"
4032 "paeth_dabc: \n\t"
4033 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4034 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4036 "paeth_dpaeth: \n\t"
4037 "incl %%ebx \n\t"
4038 "incl %%edx \n\t"
4039 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4040 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4041 "cmpl _FullLength, %%ebx \n\t"
4042 "jb paeth_dlp \n\t"
4044 "paeth_dend: \n\t"
4045 #ifdef __PIC__
4046 "popl %%ebx \n\t" // index to Global Offset Table
4047 #endif
4049 : "=c" (dummy_value_c), // output regs (dummy)
4050 "=S" (dummy_value_S),
4051 "=D" (dummy_value_D)
4053 : "0" (bpp), // ecx // input regs
4054 "1" (prev_row), // esi
4055 "2" (row) // edi
4057 : "%eax", "%edx" // clobber list
4058 #ifndef __PIC__
4059 , "%ebx"
4060 #endif
4061 );
4062 }
4063 return; // No need to go further with this one
4065 } // end switch (bpp)
4067 __asm__ __volatile__ (
4068 // MMX acceleration complete; now do clean-up
4069 // check if any remaining bytes left to decode
4070 #ifdef __PIC__
4071 "pushl %%ebx \n\t" // save index to Global Offset Table
4072 #endif
4073 "movl _MMXLength, %%ebx \n\t"
4074 "cmpl _FullLength, %%ebx \n\t"
4075 "jnb paeth_end \n\t"
4076 //pre "movl row, %%edi \n\t"
4077 //pre "movl prev_row, %%esi \n\t"
4078 // do Paeth decode for remaining bytes
4079 "movl %%ebx, %%edx \n\t"
4080 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4081 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4082 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4084 "paeth_lp2: \n\t"
4085 "xorl %%eax, %%eax \n\t"
4086 // pav = p - a = (a + b - c) - a = b - c
4087 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4088 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4089 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4090 "movl %%eax, _patemp \n\t" // Save pav for later use
4091 "xorl %%eax, %%eax \n\t"
4092 // pbv = p - b = (a + b - c) - b = a - c
4093 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4094 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4095 "movl %%eax, %%ecx \n\t"
4096 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4097 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4098 // pc = abs(pcv)
4099 "testl $0x80000000, %%eax \n\t"
4100 "jz paeth_pca2 \n\t"
4101 "negl %%eax \n\t" // reverse sign of neg values
4103 "paeth_pca2: \n\t"
4104 "movl %%eax, _pctemp \n\t" // save pc for later use
4105 // pb = abs(pbv)
4106 "testl $0x80000000, %%ecx \n\t"
4107 "jz paeth_pba2 \n\t"
4108 "negl %%ecx \n\t" // reverse sign of neg values
4110 "paeth_pba2: \n\t"
4111 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4112 // pa = abs(pav)
4113 "movl _patemp, %%eax \n\t"
4114 "testl $0x80000000, %%eax \n\t"
4115 "jz paeth_paa2 \n\t"
4116 "negl %%eax \n\t" // reverse sign of neg values
4118 "paeth_paa2: \n\t"
4119 "movl %%eax, _patemp \n\t" // save pa for later use
4120 // test if pa <= pb
4121 "cmpl %%ecx, %%eax \n\t"
4122 "jna paeth_abb2 \n\t"
4123 // pa > pb; now test if pb <= pc
4124 "cmpl _pctemp, %%ecx \n\t"
4125 "jna paeth_bbc2 \n\t"
4126 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4127 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4128 "jmp paeth_paeth2 \n\t"
4130 "paeth_bbc2: \n\t"
4131 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4132 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4133 "jmp paeth_paeth2 \n\t"
4135 "paeth_abb2: \n\t"
4136 // pa <= pb; now test if pa <= pc
4137 "cmpl _pctemp, %%eax \n\t"
4138 "jna paeth_abc2 \n\t"
4139 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4140 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4141 "jmp paeth_paeth2 \n\t"
4143 "paeth_abc2: \n\t"
4144 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4145 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4147 "paeth_paeth2: \n\t"
4148 "incl %%ebx \n\t"
4149 "incl %%edx \n\t"
4150 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4151 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4152 "cmpl _FullLength, %%ebx \n\t"
4153 "jb paeth_lp2 \n\t"
4155 "paeth_end: \n\t"
4156 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4157 #ifdef __PIC__
4158 "popl %%ebx \n\t" // restore index to Global Offset Table
4159 #endif
4161 : "=c" (dummy_value_c), // output regs (dummy)
4162 "=S" (dummy_value_S),
4163 "=D" (dummy_value_D)
4165 : "0" (bpp), // ecx // input regs
4166 "1" (prev_row), // esi
4167 "2" (row) // edi
4169 : "%eax", "%edx" // clobber list (no input regs!)
4170 #ifndef __PIC__
4171 , "%ebx"
4172 #endif
4173 );
4175 } /* end png_read_filter_row_mmx_paeth() */
4180 //===========================================================================//
4181 // //
4182 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4183 // //
4184 //===========================================================================//
4186 // Optimized code for PNG Sub filter decoder
// Reverse the PNG "Sub" filter on one row, in place.
//
// For every index i in [0, rowbytes - bpp) the byte rp[i] (rp = row + bpp)
// has lp[i] (lp = row) added to it modulo 256, i.e. each byte receives the
// already-reconstructed byte bpp positions to its left, where
// bpp = (pixel_depth + 7) >> 3.
//
//   row_info  supplies pixel_depth and rowbytes
//   row       the row bytes; overwritten with the reconstructed values
//
// The work is done in three stages:
//   1. a byte loop handles the first _dif bytes so that rp + _dif is
//      8-byte aligned, and _MMXLength (end of the 8-byte-multiple region)
//      is computed;
//   2. a bpp-specific MMX loop handles [_dif, _MMXLength);
//   3. a byte loop handles [_MMXLength, _FullLength) and issues EMMS.
// bpp == 1 uses a plain byte loop for everything after stage 1 and returns.
//
// The routine keeps its state in the file-scope variables _dif, _FullLength,
// _MMXLength, _ActiveMask, _ShiftBpp and _ShiftRem, so it is presumably not
// reentrant (NOTE(review): confirm against their definitions).  The MMX loops
// test their bound only after the first pass; presumably callers guarantee a
// sufficiently long row (png_read_filter_row checks rowbytes against
// PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT) -- TODO confirm.
4188 static void /* PRIVATE */
4189 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4190 {
4191 int bpp;
4192 int dummy_value_a;
4193 int dummy_value_D;
4195 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4196 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
// Stage 1: align rp to an 8-byte boundary with a byte loop, then derive
// _MMXLength.  eax = bpp and edi = row on entry.
4198 __asm__ __volatile__ (
4199 //pre "movl row, %%edi \n\t"
4200 "movl %%edi, %%esi \n\t" // lp = row
4201 //pre "movl bpp, %%eax \n\t"
4202 "addl %%eax, %%edi \n\t" // rp = row + bpp
4203 //irr "xorl %%eax, %%eax \n\t"
4204 // get # of bytes to alignment
4205 "movl %%edi, _dif \n\t" // take start of row
4206 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4207 // alignment boundary
4208 "xorl %%ecx, %%ecx \n\t"
4209 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4210 "subl %%edi, _dif \n\t" // subtract from start ==> value
4211 "jz sub_go \n\t" // ecx at alignment
4213 "sub_lp1: \n\t" // fix alignment
4214 "movb (%%esi,%%ecx,), %%al \n\t"
4215 "addb %%al, (%%edi,%%ecx,) \n\t"
4216 "incl %%ecx \n\t"
4217 "cmpl _dif, %%ecx \n\t"
4218 "jb sub_lp1 \n\t"
4220 "sub_go: \n\t"
4221 "movl _FullLength, %%eax \n\t"
4222 "movl %%eax, %%edx \n\t"
4223 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4224 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4225 "subl %%edx, %%eax \n\t" // drop over bytes from length
4226 "movl %%eax, _MMXLength \n\t"
4228 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4229 "=D" (dummy_value_D) // 1
4231 : "0" (bpp), // eax // input regs
4232 "1" (row) // edi
// ebx is never touched by this asm, so it is no longer listed as clobbered
// (a needless ebx clobber breaks -fPIC builds where ebx holds the GOT).
4234 : "%ecx", "%edx" // clobber list
4235 , "%esi"
4237 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4238 , "%mm0", "%mm1", "%mm2", "%mm3"
4239 , "%mm4", "%mm5", "%mm6", "%mm7"
4240 #endif
4241 );
4243 // now do the math for the rest of the row
4244 switch (bpp)
4245 {
4246 case 3:
4247 {
4248 _ActiveMask.use = 0x0000ffffff000000LL;
4249 _ShiftBpp.use = 24; // == 3 * 8
4250 _ShiftRem.use = 40; // == 64 - 24
4252 __asm__ __volatile__ (
4253 // preload "movl row, %%edi \n\t"
4254 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4255 // active byte group
4256 "movl %%edi, %%esi \n\t" // lp = row
4257 // preload "movl bpp, %%eax \n\t"
4258 "addl %%eax, %%edi \n\t" // rp = row + bpp
4259 "movq %%mm7, %%mm6 \n\t"
4260 "movl _dif, %%edx \n\t"
4261 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4262 // 3rd active byte group
4263 // prime the pump: load the first Raw(x-bpp) data set
4264 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4266 "sub_3lp: \n\t" // shift data for adding first
4267 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4268 // shift clears inactive bytes)
4269 // add 1st active group
4270 "movq (%%edi,%%edx,), %%mm0 \n\t"
4271 "paddb %%mm1, %%mm0 \n\t"
4273 // add 2nd active group
4274 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4275 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4276 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4277 "paddb %%mm1, %%mm0 \n\t"
4279 // add 3rd active group
4280 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4281 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4282 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4283 "addl $8, %%edx \n\t"
4284 "paddb %%mm1, %%mm0 \n\t"
4286 "cmpl _MMXLength, %%edx \n\t"
4287 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4288 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4289 "jb sub_3lp \n\t"
4291 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4292 "=D" (dummy_value_D) // 1
4294 : "0" (bpp), // eax // input regs
4295 "1" (row) // edi
4297 : "%edx", "%esi" // clobber list
4298 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4299 , "%mm0", "%mm1", "%mm6", "%mm7"
4300 #endif
4301 );
4302 }
4303 break;
4305 case 1:
4306 {
// Plain byte loop over [_dif, _FullLength); no MMX is used, so this case
// returns directly without reaching the EMMS in the final stage.
4307 __asm__ __volatile__ (
4308 "movl _dif, %%edx \n\t"
4309 // preload "movl row, %%edi \n\t"
4310 "cmpl _FullLength, %%edx \n\t"
4311 "jnb sub_1end \n\t"
4312 "movl %%edi, %%esi \n\t" // lp = row
// BUGFIX: bpp arrives preloaded in eax, so it must be added to edi BEFORE
// eax is cleared.  Clearing it first left rp == lp, which doubled every
// byte instead of adding Raw(x-bpp).  The order now matches sub_lp2 below.
4314 // preload "movl bpp, %%eax \n\t"
4315 "addl %%eax, %%edi \n\t" // rp = row + bpp
4313 "xorl %%eax, %%eax \n\t" // clear eax only after bpp was consumed
4317 "sub_1lp: \n\t"
4318 "movb (%%esi,%%edx,), %%al \n\t"
4319 "addb %%al, (%%edi,%%edx,) \n\t"
4320 "incl %%edx \n\t"
4321 "cmpl _FullLength, %%edx \n\t"
4322 "jb sub_1lp \n\t"
4324 "sub_1end: \n\t"
4326 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4327 "=D" (dummy_value_D) // 1
4329 : "0" (bpp), // eax // input regs
4330 "1" (row) // edi
4332 : "%edx", "%esi" // clobber list
4333 );
4334 }
4335 return;
4337 case 6:
4338 case 4:
4339 //case 7: // GRR BOGUS
4340 //case 5: // GRR BOGUS
4341 {
4342 _ShiftBpp.use = bpp << 3;
4343 _ShiftRem.use = 64 - _ShiftBpp.use;
4345 __asm__ __volatile__ (
4346 // preload "movl row, %%edi \n\t"
4347 "movl _dif, %%edx \n\t"
4348 "movl %%edi, %%esi \n\t" // lp = row
4349 // preload "movl bpp, %%eax \n\t"
4350 "addl %%eax, %%edi \n\t" // rp = row + bpp
4352 // prime the pump: load the first Raw(x-bpp) data set
4353 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4355 "sub_4lp: \n\t" // shift data for adding first
4356 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4357 // shift clears inactive bytes)
4358 "movq (%%edi,%%edx,), %%mm0 \n\t"
4359 "paddb %%mm1, %%mm0 \n\t"
4361 // add 2nd active group
4362 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4363 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4364 "addl $8, %%edx \n\t"
4365 "paddb %%mm1, %%mm0 \n\t"
4367 "cmpl _MMXLength, %%edx \n\t"
4368 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4369 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4370 "jb sub_4lp \n\t"
4372 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4373 "=D" (dummy_value_D) // 1
4375 : "0" (bpp), // eax // input regs
4376 "1" (row) // edi
4378 : "%edx", "%esi" // clobber list
4379 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4380 , "%mm0", "%mm1"
4381 #endif
4382 );
4383 }
4384 break;
4386 case 2:
4387 {
4388 _ActiveMask.use = 0x00000000ffff0000LL;
4389 _ShiftBpp.use = 16; // == 2 * 8
4390 _ShiftRem.use = 48; // == 64 - 16
4392 __asm__ __volatile__ (
4393 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4394 // active byte group
4395 "movl _dif, %%edx \n\t"
4396 "movq %%mm7, %%mm6 \n\t"
4397 // preload "movl row, %%edi \n\t"
4398 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4399 // 3rd active byte group
4400 "movl %%edi, %%esi \n\t" // lp = row
4401 "movq %%mm6, %%mm5 \n\t"
4402 // preload "movl bpp, %%eax \n\t"
4403 "addl %%eax, %%edi \n\t" // rp = row + bpp
4404 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4405 // 4th active byte group
4406 // prime the pump: load the first Raw(x-bpp) data set
4407 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4409 "sub_2lp: \n\t" // shift data for adding first
4410 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4411 // shift clears inactive bytes)
4412 // add 1st active group
4413 "movq (%%edi,%%edx,), %%mm0 \n\t"
4414 "paddb %%mm1, %%mm0 \n\t"
4416 // add 2nd active group
4417 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4418 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4419 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4420 "paddb %%mm1, %%mm0 \n\t"
4422 // add 3rd active group
4423 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4424 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4425 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4426 "paddb %%mm1, %%mm0 \n\t"
4428 // add 4th active group
4429 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4430 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4431 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4432 "addl $8, %%edx \n\t"
4433 "paddb %%mm1, %%mm0 \n\t"
4434 "cmpl _MMXLength, %%edx \n\t"
4435 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4436 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4437 "jb sub_2lp \n\t"
4439 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4440 "=D" (dummy_value_D) // 1
4442 : "0" (bpp), // eax // input regs
4443 "1" (row) // edi
4445 : "%edx", "%esi" // clobber list
4446 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4447 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4448 #endif
4449 );
4450 }
4451 break;
4453 case 8:
4454 {
4455 __asm__ __volatile__ (
4456 // preload "movl row, %%edi \n\t"
4457 "movl _dif, %%edx \n\t"
4458 "movl %%edi, %%esi \n\t" // lp = row
4459 // preload "movl bpp, %%eax \n\t"
4460 "addl %%eax, %%edi \n\t" // rp = row + bpp
4461 "movl _MMXLength, %%ecx \n\t"
4463 // prime the pump: load the first Raw(x-bpp) data set
4464 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4465 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
// NOTE(review): ecx = _MMXLength & 0x3f is always < 64, while edx has
// already been advanced by 64 when it is compared with ecx below, so this
// unrolled loop appears to run exactly once; sub_8lpA then finishes the
// region 8 bytes at a time.  Confirm whether ecx was meant to hold the end
// of the 64-byte-multiple region instead.
4467 "sub_8lp: \n\t"
4468 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4469 "paddb %%mm7, %%mm0 \n\t"
4470 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4471 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4473 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4474 // This will be repeated for each group of 8 bytes with the 8th
4475 // group being used as the Raw(x-bpp) for the 1st group of the
4476 // next loop.
4478 "paddb %%mm0, %%mm1 \n\t"
4479 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4480 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4481 "paddb %%mm1, %%mm2 \n\t"
4482 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4483 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4484 "paddb %%mm2, %%mm3 \n\t"
4485 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4486 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4487 "paddb %%mm3, %%mm4 \n\t"
4488 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4489 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4490 "paddb %%mm4, %%mm5 \n\t"
4491 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4492 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4493 "paddb %%mm5, %%mm6 \n\t"
4494 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4495 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4496 "addl $64, %%edx \n\t"
4497 "paddb %%mm6, %%mm7 \n\t"
4498 "cmpl %%ecx, %%edx \n\t"
4499 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4500 "jb sub_8lp \n\t"
4502 "cmpl _MMXLength, %%edx \n\t"
4503 "jnb sub_8lt8 \n\t"
4505 "sub_8lpA: \n\t"
4506 "movq (%%edi,%%edx,), %%mm0 \n\t"
4507 "addl $8, %%edx \n\t"
4508 "paddb %%mm7, %%mm0 \n\t"
4509 "cmpl _MMXLength, %%edx \n\t"
4510 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4511 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4512 // to mm7 to be new Raw(x-bpp)
4513 // for next loop
4514 "jb sub_8lpA \n\t"
4516 "sub_8lt8: \n\t"
4518 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4519 "=D" (dummy_value_D) // 1
4521 : "0" (bpp), // eax // input regs
4522 "1" (row) // edi
4524 : "%ecx", "%edx", "%esi" // clobber list
4525 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4526 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4527 #endif
4528 );
4529 }
4530 break;
4532 default: // bpp greater than 8 bytes GRR BOGUS
4533 {
// Adds lp[edx..edx+7] to rp[edx..edx+7] eight bytes at a time; the source
// bytes are read before the destination is written in each pass.
4534 __asm__ __volatile__ (
4535 "movl _dif, %%edx \n\t"
4536 // preload "movl row, %%edi \n\t"
4537 "movl %%edi, %%esi \n\t" // lp = row
4538 // preload "movl bpp, %%eax \n\t"
4539 "addl %%eax, %%edi \n\t" // rp = row + bpp
4541 "sub_Alp: \n\t"
4542 "movq (%%edi,%%edx,), %%mm0 \n\t"
4543 "movq (%%esi,%%edx,), %%mm1 \n\t"
4544 "addl $8, %%edx \n\t"
4545 "paddb %%mm1, %%mm0 \n\t"
4546 "cmpl _MMXLength, %%edx \n\t"
4547 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4548 // -8 to offset addl edx
4549 "jb sub_Alp \n\t"
4551 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4552 "=D" (dummy_value_D) // 1
4554 : "0" (bpp), // eax // input regs
4555 "1" (row) // edi
4557 : "%edx", "%esi" // clobber list
4558 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4559 , "%mm0", "%mm1"
4560 #endif
4561 );
4562 }
4563 break;
4565 } // end switch (bpp)
// Stage 3: byte loop over the leftover [_MMXLength, _FullLength), then EMMS
// to leave MMX state.
4567 __asm__ __volatile__ (
4568 "movl _MMXLength, %%edx \n\t"
4569 //pre "movl row, %%edi \n\t"
4570 "cmpl _FullLength, %%edx \n\t"
4571 "jnb sub_end \n\t"
4573 "movl %%edi, %%esi \n\t" // lp = row
4574 //pre "movl bpp, %%eax \n\t"
4575 "addl %%eax, %%edi \n\t" // rp = row + bpp
4576 "xorl %%eax, %%eax \n\t"
4578 "sub_lp2: \n\t"
4579 "movb (%%esi,%%edx,), %%al \n\t"
4580 "addb %%al, (%%edi,%%edx,) \n\t"
4581 "incl %%edx \n\t"
4582 "cmpl _FullLength, %%edx \n\t"
4583 "jb sub_lp2 \n\t"
4585 "sub_end: \n\t"
4586 "EMMS \n\t" // end MMX instructions
4588 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4589 "=D" (dummy_value_D) // 1
4591 : "0" (bpp), // eax // input regs
4592 "1" (row) // edi
4594 : "%edx", "%esi" // clobber list
4595 );
4597 } // end of png_read_filter_row_mmx_sub()
4602 //===========================================================================//
4603 // //
4604 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4605 // //
4606 //===========================================================================//
4608 // Optimized code for PNG Up filter decoder
// Reverse the PNG "Up" filter on one row, in place:
// row[i] = row[i] + prev_row[i] (mod 256) for i in [0, rowbytes).
//
//   row_info  supplies rowbytes (the number of bytes to process)
//   row       the row bytes; overwritten with the reconstructed values
//   prev_row  the bytes of the row above; only read
//
// Phases (ebx is the running byte index):
//   up_lp1   byte loop until row + ebx is 8-byte aligned
//   up_loop  64 bytes per pass using all eight MMX registers
//   up_lpA   8 bytes per pass for what remains of the 8-byte multiples
//   up_lp2   byte loop for the final (< 8) bytes
//   up_end   EMMS, then return
//
// up_loop tests its bound only after a full 64-byte pass; presumably callers
// guarantee a sufficiently long row (png_read_filter_row checks rowbytes
// against PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT) -- TODO confirm.
//
// ebx is used as the index register.  When compiling position-independent
// code it is saved and restored by hand and not listed as a clobber, the
// same way the Paeth routine in this file treats it.  All operands live in
// registers (edx/esi/edi), so the extra push does not disturb any operand.
4610 static void /* PRIVATE */
4611 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4612 png_bytep prev_row)
4613 {
4614 png_uint_32 len;
4615 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4616 int dummy_value_S;
4617 int dummy_value_D;
4619 len = row_info->rowbytes; // number of bytes to filter
4621 __asm__ __volatile__ (
#ifdef __PIC__
"pushl %%ebx \n\t" // save Global Offset Table index
#endif
4622 //pre "movl row, %%edi \n\t"
4623 // get # of bytes to alignment
4624 "movl %%edi, %%ecx \n\t"
4625 "xorl %%ebx, %%ebx \n\t"
4626 "addl $0x7, %%ecx \n\t"
4627 "xorl %%eax, %%eax \n\t"
4628 "andl $0xfffffff8, %%ecx \n\t"
4629 //pre "movl prev_row, %%esi \n\t"
4630 "subl %%edi, %%ecx \n\t"
4631 "jz up_go \n\t"
4633 "up_lp1: \n\t" // fix alignment
4634 "movb (%%edi,%%ebx,), %%al \n\t"
4635 "addb (%%esi,%%ebx,), %%al \n\t"
4636 "incl %%ebx \n\t"
4637 "cmpl %%ecx, %%ebx \n\t"
4638 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4639 "jb up_lp1 \n\t" // offset incl ebx
4641 "up_go: \n\t"
4642 //pre "movl len, %%edx \n\t"
4643 "movl %%edx, %%ecx \n\t"
4644 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4645 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4646 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4648 // unrolled loop - use all MMX registers and interleave to reduce
4649 // number of branch instructions (loops) and reduce partial stalls
4650 "up_loop: \n\t"
4651 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4652 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4653 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4654 "paddb %%mm1, %%mm0 \n\t"
4655 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4656 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4657 "paddb %%mm3, %%mm2 \n\t"
4658 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4659 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4660 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4661 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4662 "paddb %%mm5, %%mm4 \n\t"
4663 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4664 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4665 "paddb %%mm7, %%mm6 \n\t"
4666 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4667 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4668 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4669 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4670 "paddb %%mm1, %%mm0 \n\t"
4671 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4672 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4673 "paddb %%mm3, %%mm2 \n\t"
4674 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4675 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4676 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4677 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4678 "paddb %%mm5, %%mm4 \n\t"
4679 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4680 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4681 "addl $64, %%ebx \n\t"
4682 "paddb %%mm7, %%mm6 \n\t"
4683 "cmpl %%ecx, %%ebx \n\t"
4684 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4685 "jb up_loop \n\t" // -8 to offset addl ebx
4687 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
4688 "jz up_end \n\t"
4690 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
4691 "jb up_lt8 \n\t" // [added by lcreeve@netins.net]
4693 "addl %%edx, %%ecx \n\t"
4694 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4695 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4696 "jz up_lt8 \n\t"
4698 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
4699 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4700 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4701 "addl $8, %%ebx \n\t"
4702 "paddb %%mm1, %%mm0 \n\t"
4703 "cmpl %%ecx, %%ebx \n\t"
4704 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4705 "jb up_lpA \n\t" // offset add ebx
4706 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
4707 "jz up_end \n\t"
4709 "up_lt8: \n\t"
4710 "xorl %%eax, %%eax \n\t"
4711 "addl %%edx, %%ecx \n\t" // move over byte count into counter
4713 "up_lp2: \n\t" // use x86 regs for remaining bytes
4714 "movb (%%edi,%%ebx,), %%al \n\t"
4715 "addb (%%esi,%%ebx,), %%al \n\t"
4716 "incl %%ebx \n\t"
4717 "cmpl %%ecx, %%ebx \n\t"
4718 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4719 "jb up_lp2 \n\t" // offset inc ebx
4721 "up_end: \n\t"
4722 "EMMS \n\t" // conversion of filtered row complete
#ifdef __PIC__
"popl %%ebx \n\t" // restore index to Global Offset Table
#endif
4724 : "=d" (dummy_value_d), // 0 // output regs (dummy)
4725 "=S" (dummy_value_S), // 1
4726 "=D" (dummy_value_D) // 2
4728 : "0" (len), // edx // input regs
4729 "1" (prev_row), // esi
4730 "2" (row) // edi
4732 : "%eax", "%ecx" // clobber list (no input regs!)
#ifndef __PIC__
, "%ebx"
#endif
4734 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4735 , "%mm0", "%mm1", "%mm2", "%mm3"
4736 , "%mm4", "%mm5", "%mm6", "%mm7"
4737 #endif
4738 );
4740 } // end of png_read_filter_row_mmx_up()
4745 //===========================================================================//
4746 // //
4747 // P N G _ R E A D _ F I L T E R _ R O W //
4748 // //
4749 //===========================================================================//
4751 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
4753 // Optimized png_read_filter_row routines
// Undo the PNG filter of one row, in place, choosing between the MMX
// routines and the portable C loops.
//
//   png_ptr   used for the debug trace (row_number) and for png_warning()
//   row_info  supplies pixel_depth and rowbytes
//   row       the row bytes; overwritten with the reconstructed values
//   prev_row  the bytes of the row above (read by Up, Average and Paeth)
//   filter    one of the PNG_FILTER_VALUE_* codes; any other value emits a
//             warning and stores 0 in row[0]
//
// An MMX routine is used only when ALL of the following hold:
//   - the CPU reports MMX support (_mmx_supported, probed on first use via
//     png_mmx_support(), which sets it to 0 or 1),
//   - pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT,
//   - rowbytes    >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT.
// Otherwise the C fallback in the corresponding else-branch runs.
4755 void /* PRIVATE */
4756 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
4757 row, png_bytep prev_row, int filter)
4758 {
4759 #ifdef PNG_DEBUG
4760 char filnm[10];
4761 #endif
4763 /* GRR: these are superseded by png_ptr->asm_flags: */
4764 #define UseMMX_sub 1 // GRR: converted 20000730
4765 #define UseMMX_up 1 // GRR: converted 20000729
4766 #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
4767 #define UseMMX_paeth 1 // GRR: converted 20000828
// A value of 2 means "not probed yet"; png_mmx_support() replaces it with
// 0 (no MMX) or 1 (MMX available).
4769 if (_mmx_supported == 2) {
4770 png_mmx_support();
4771 }
4773 #ifdef PNG_DEBUG
4774 png_debug(1, "in png_read_filter_row\n");
// The label reports the CPU capability so the trace no longer claims MMX on
// a CPU without it; the longest result, "Paeth-MMX"/"Paeth-x86", is 9
// characters plus the terminator and exactly fills filnm[10].
4775 switch (filter)
4776 {
4777 case 0: sprintf(filnm, "none");
4778 break;
4779 case 1: sprintf(filnm, "sub-%s", _mmx_supported ? "MMX" : "x86");
4780 break;
4781 case 2: sprintf(filnm, "up-%s", _mmx_supported ? "MMX" : "x86");
4782 break;
4783 case 3: sprintf(filnm, "avg-%s", _mmx_supported ? "MMX" : "x86");
4784 break;
4785 case 4: sprintf(filnm, "Paeth-%s", _mmx_supported ? "MMX" : "x86");
4786 break;
4787 default: sprintf(filnm, "unknw");
4788 break;
4789 }
4790 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
4791 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
4792 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
4793 (int)((row_info->pixel_depth + 7) >> 3));
4794 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
4795 #endif /* PNG_DEBUG */
4797 switch (filter)
4798 {
4799 case PNG_FILTER_VALUE_NONE:
4800 break;
4802 case PNG_FILTER_VALUE_SUB:
4803 if (
_mmx_supported && // BUGFIX: never execute MMX code on a CPU without MMX
4804 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4805 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4806 {
4807 png_read_filter_row_mmx_sub(row_info, row);
4808 }
4809 else
4810 {
// C fallback: Raw(x) = Sub(x) + Raw(x-bpp)
4811 png_uint_32 i;
4812 png_uint_32 istop = row_info->rowbytes;
4813 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4814 png_bytep rp = row + bpp;
4815 png_bytep lp = row;
4817 for (i = bpp; i < istop; i++)
4818 {
4819 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
4820 rp++;
4821 }
4822 } //end !UseMMX_sub
4823 break;
4825 case PNG_FILTER_VALUE_UP:
4826 if (
_mmx_supported && // BUGFIX: never execute MMX code on a CPU without MMX
4827 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4828 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4829 {
4830 png_read_filter_row_mmx_up(row_info, row, prev_row);
4831 }
4832 else
4833 {
// C fallback: Raw(x) = Up(x) + Prior(x)
4834 png_uint_32 i;
4835 png_uint_32 istop = row_info->rowbytes;
4836 png_bytep rp = row;
4837 png_bytep pp = prev_row;
4839 for (i = 0; i < istop; ++i)
4840 {
4841 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4842 rp++;
4843 }
4844 } //end !UseMMX_up
4845 break;
4847 case PNG_FILTER_VALUE_AVG:
4848 if (
_mmx_supported && // BUGFIX: never execute MMX code on a CPU without MMX
4849 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4850 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4851 {
4852 png_read_filter_row_mmx_avg(row_info, row, prev_row);
4853 }
4854 else
4855 {
// C fallback: the first bpp bytes have no left neighbour, so only half of
// Prior(x) is added; afterwards the average of Raw(x-bpp) and Prior(x).
4856 png_uint_32 i;
4857 png_bytep rp = row;
4858 png_bytep pp = prev_row;
4859 png_bytep lp = row;
4860 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4861 png_uint_32 istop = row_info->rowbytes - bpp;
4863 for (i = 0; i < bpp; i++)
4864 {
4865 *rp = (png_byte)(((int)(*rp) +
4866 ((int)(*pp++) >> 1)) & 0xff);
4867 rp++;
4868 }
4870 for (i = 0; i < istop; i++)
4871 {
4872 *rp = (png_byte)(((int)(*rp) +
4873 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
4874 rp++;
4875 }
4876 } //end !UseMMX_avg
4877 break;
4879 case PNG_FILTER_VALUE_PAETH:
4880 if (
_mmx_supported && // BUGFIX: never execute MMX code on a CPU without MMX
4881 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4882 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4883 {
4884 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
4885 }
4886 else
4887 {
// C fallback: the first bpp bytes use Prior(x) directly; afterwards the
// predictor is whichever of a (left), b (above), c (upper-left) is closest
// to a + b - c, ties resolved in the order a, b, c.
4888 png_uint_32 i;
4889 png_bytep rp = row;
4890 png_bytep pp = prev_row;
4891 png_bytep lp = row;
4892 png_bytep cp = prev_row;
4893 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4894 png_uint_32 istop = row_info->rowbytes - bpp;
4896 for (i = 0; i < bpp; i++)
4897 {
4898 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4899 rp++;
4900 }
4902 for (i = 0; i < istop; i++) /* use leftover rp,pp */
4903 {
4904 int a, b, c, pa, pb, pc, p;
4906 a = *lp++;
4907 b = *pp++;
4908 c = *cp++;
4910 p = b - c;
4911 pc = a - c;
4913 #ifdef PNG_USE_ABS
4914 pa = abs(p);
4915 pb = abs(pc);
4916 pc = abs(p + pc);
4917 #else
4918 pa = p < 0 ? -p : p;
4919 pb = pc < 0 ? -pc : pc;
4920 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
4921 #endif
4923 /*
4924 if (pa <= pb && pa <= pc)
4925 p = a;
4926 else if (pb <= pc)
4927 p = b;
4928 else
4929 p = c;
4930 */
4932 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
4934 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
4935 rp++;
4936 }
4937 } //end !UseMMX_paeth
4938 break;
4940 default:
4941 png_warning(png_ptr, "Ignoring bad row-filter type");
4942 *row=0;
4943 break;
4944 }
4945 }
4947 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
4952 //===========================================================================//
4953 // //
4954 // P N G _ M M X _ S U P P O R T //
4955 // //
4956 //===========================================================================//
4958 // GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
4959 // (2) all instructions compile with gcc 2.7.2.3 and later
4960 // (3) the function is moved down here to prevent gcc from
4961 // inlining it in multiple places and then barfing be-
4962 // cause the ".NOT_SUPPORTED" label is multiply defined
4963 // [is there a way to signal that a *single* function should
4964 // not be inlined? is there a way to modify the label for
4965 // each inlined instance, e.g., by appending _1, _2, etc.?
4966 // maybe if don't use leading "." in label name? (nope...sigh)]
4968 // GRR TO DO: make sure PNGAPI doesn't do/require anything screwy here
4969 // [looks OK for everybody except possibly Cygwin (__cdecl)]
4971 int PNGAPI
4972 png_mmx_support(void)
4973 {
4974 __asm__ __volatile__ (
4975 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
4976 "pushl %%ecx \n\t" // so does ecx...
4977 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
4978 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
4979 // "pushf \n\t" // 16-bit pushf
4980 "pushfl \n\t" // save Eflag to stack
4981 "popl %%eax \n\t" // get Eflag from stack into eax
4982 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
4983 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4984 "pushl %%eax \n\t" // save modified Eflag back to stack
4985 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
4986 // "popf \n\t" // 16-bit popf
4987 "popfl \n\t" // restore modified value to Eflag reg
4988 "pushfl \n\t" // save Eflag to stack
4989 "popl %%eax \n\t" // get Eflag from stack
4990 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
4991 "jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
4993 "xorl %%eax, %%eax \n\t" // set eax to zero
4994 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
4995 "cpuid \n\t" // get the CPU identification info
4996 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
4997 "jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
4999 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5000 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5001 // faster than the instruction "mov eax, 1"
5002 "cpuid \n\t" // get the CPU identification info again
5003 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5004 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5005 "jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
5007 "movl $1, %%eax \n\t" // set return value to 1
5008 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5009 "popl %%edx \n\t" // restore edx
5010 "popl %%ecx \n\t" // restore ecx
5011 "popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
5012 "ret \n\t" // DONE: have MMX support
5014 ".NOT_SUPPORTED: \n\t" // target label for jump instructions
5015 "movl $0, %%eax \n\t" // set return value to 0
5016 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5017 "popl %%edx \n\t" // restore edx
5018 "popl %%ecx \n\t" // restore ecx
5019 "popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
5020 // "ret \n\t" // DONE: no MMX support
5021 // (fall through to standard C "ret")
5023 : // output list (none)
5025 : // any variables used on input (none)
5027 : "%eax" // clobber list
5028 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5029 // , "memory" // if write to a variable gcc thought was in a reg
5030 // , "cc" // "condition codes" (flag bits)
5031 );
5033 // return %%eax;
5034 }
5036 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */