1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
9 * libpng version 1.2.0 - September 1, 2001
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
26 * For djgpp, see
27 *
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
32 *
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 * binutils.tgz
37 *
38 * For other platforms, see the main GNU site:
39 *
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 * Version 2.5.2l.15 is definitely too old...
43 */
45 /*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
113 *
114 * 19991107:
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
117 *
118 * 19991120:
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
127 *
128 * 20000319:
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
152 *
153 * 20000729:
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
160 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
173 *
174 * 20000823:
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
189 *
190 * 20000829:
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
195 *
196 * 20000914:
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010101:
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
210 *
211 * 20010103:
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
214 *
215 * 20010104:
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
219 *
220 * 20010310:
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222 *
223 * STILL TO DO:
224 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
225 * - write MMX code for 48-bit case (pixel_bytes == 6)
226 * - figure out what's up with 24-bit case (pixel_bytes == 3):
227 * why subtract 8 from width_mmx in the pass 4/5 case?
228 * (only width_mmx case) (near line 1606)
229 * - rewrite all MMX interlacing code so it's aligned with beginning
230 * of the row buffer, not the end (see 19991007 for details)
231 * x pick one version of mmxsupport() and get rid of the other
232 * - add error messages to any remaining bogus default cases
233 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
234 * x add support for runtime enable/disable/query of various MMX routines
235 */
237 #define PNG_INTERNAL
238 #include "png.h"
240 #if defined(PNG_USE_PNGGCCRD)
242 int PNGAPI png_mmx_support(void);
/* Local copies of the per-interlace-pass tables (the comments in
 * png_combine_row() attribute the originals to png.c).  Each table has one
 * entry per pass and is indexed by png_ptr->pass.  The C fallback in
 * png_combine_row() uses them, in pixel units, as:
 *   png_pass_start - offset of the first pixel handled in the row
 *   png_pass_inc   - distance from one handled pixel group to the next
 *   png_pass_width - number of pixels copied at each step
 * Byte offsets are obtained there by multiplying with the BPPn constants.
 */
244 #ifdef PNG_USE_LOCAL_ARRAYS
245 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
246 static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
247 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
248 #endif
250 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
251 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
252 * so define them without: */
/* The inline assembler strings in this file always spell these globals with
 * a leading underscore.  On the platforms tested below, the C-level name is
 * therefore mapped to the un-prefixed spelling, so that the symbol emitted
 * for it matches what the assembler strings reference. */
253 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
254 # define _mmx_supported mmx_supported
255 # define _const4 const4
256 # define _const6 const6
257 # define _mask8_0 mask8_0
258 # define _mask16_1 mask16_1
259 # define _mask16_0 mask16_0
260 # define _mask24_2 mask24_2
261 # define _mask24_1 mask24_1
262 # define _mask24_0 mask24_0
263 # define _mask32_3 mask32_3
264 # define _mask32_2 mask32_2
265 # define _mask32_1 mask32_1
266 # define _mask32_0 mask32_0
267 # define _mask48_5 mask48_5
268 # define _mask48_4 mask48_4
269 # define _mask48_3 mask48_3
270 # define _mask48_2 mask48_2
271 # define _mask48_1 mask48_1
272 # define _mask48_0 mask48_0
273 # define _LBCarryMask LBCarryMask
274 # define _HBClearMask HBClearMask
275 # define _ActiveMask ActiveMask
276 # define _ActiveMask2 ActiveMask2
277 # define _ActiveMaskEnd ActiveMaskEnd
278 # define _ShiftBpp ShiftBpp
279 # define _ShiftRem ShiftRem
/* The remaining names are renamed only when PNG_THREAD_UNSAFE_OK is defined,
 * matching the guards around the corresponding variable definitions. */
280 #ifdef PNG_THREAD_UNSAFE_OK
281 # define _unmask unmask
282 # define _FullLength FullLength
283 # define _MMXLength MMXLength
284 # define _dif dif
285 # define _patemp patemp
286 # define _pbtemp pbtemp
287 # define _pctemp pctemp
288 #endif /* PNG_THREAD_UNSAFE_OK */
289 #endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
292 /* These constants are used in the inlined MMX assembly code.
293 Ignore gcc's "At top level: defined but not used" warnings. */
295 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
296 * since that case uses the %ebx register for indexing the Global Offset Table
297 * and there were no other registers available. But gcc 2.95 and later emit
298 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
299 * in the non-PIC case, so we'll just use the global unconditionally now.
300 */
/* _unmask: png_combine_row() stores ~mask here immediately before each MMX
 * block, and the assembler loads it with "movd _unmask, %%mm7".  Being a
 * file-scope static, it is shared by every caller of that routine. */
301 #ifdef PNG_THREAD_UNSAFE_OK
302 static int _unmask;
303 #endif
/* Byte-lane selection masks for the MMX paths of png_combine_row().
 * The assembler replicates the low byte of _unmask into all eight bytes of
 * an MMX register, ANDs that with one of these constants and compares the
 * result with zero (pcmpeqb), which leaves 0xFF in every byte lane whose
 * pixel is to be copied from the new row and 0x00 in the others.
 *
 * Each byte of a constant holds the mask bit of the pixel that owns that byte
 * position.  The least-significant byte is the lowest-addressed one on x86
 * and belongs to the leftmost pixel of the group, whose mask bit is 0x80.
 * _maskNN_k is applied to bytes 8*k .. 8*k+7 of each 8-pixel group of
 * NN-bit pixels (visible here for the 8-, 16- and 24-bit cases; the 32- and
 * 48-bit constants follow the same layout -- their use is outside this
 * part of the file). */
305 static unsigned long long _mask8_0 = 0x0102040810204080LL;
/* 16-bit pixels: two consecutive byte lanes per pixel. */
307 static unsigned long long _mask16_1 = 0x0101020204040808LL;
308 static unsigned long long _mask16_0 = 0x1010202040408080LL;
/* 24-bit pixels: three consecutive byte lanes per pixel. */
310 static unsigned long long _mask24_2 = 0x0101010202020404LL;
311 static unsigned long long _mask24_1 = 0x0408080810101020LL;
312 static unsigned long long _mask24_0 = 0x2020404040808080LL;
/* 32-bit pixels: four consecutive byte lanes per pixel. */
314 static unsigned long long _mask32_3 = 0x0101010102020202LL;
315 static unsigned long long _mask32_2 = 0x0404040408080808LL;
316 static unsigned long long _mask32_1 = 0x1010101020202020LL;
317 static unsigned long long _mask32_0 = 0x4040404080808080LL;
/* 48-bit pixels: six consecutive byte lanes per pixel. */
319 static unsigned long long _mask48_5 = 0x0101010101010202LL;
320 static unsigned long long _mask48_4 = 0x0202020204040404LL;
321 static unsigned long long _mask48_3 = 0x0404080808080808LL;
322 static unsigned long long _mask48_2 = 0x1010101010102020LL;
323 static unsigned long long _mask48_1 = 0x2020202040404040LL;
324 static unsigned long long _mask48_0 = 0x4040808080808080LL;
/* _const4 has the low three bytes set and _const6 the low byte set.
 * NOTE(review): their users are not in this part of the file -- confirm
 * against the assembler code that references them. */
326 static unsigned long long _const4 = 0x0000000000FFFFFFLL;
327 //static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
328 static unsigned long long _const6 = 0x00000000000000FFLL;
330 // These are used in the row-filter routines and should/would be local
331 // variables if not for gcc addressing limitations.
332 // WARNING: Their presence probably defeats the thread safety of libpng.
/* Scratch variables, defined only when PNG_THREAD_UNSAFE_OK is set.  They are
 * file-scope statics and are therefore shared by all callers.
 * NOTE(review): the code that uses them is not in this part of the file.
 * The changelog at the top of the file says _dif replaced a local "diff" in
 * the filtering routines; the roles of _FullLength and _MMXLength are
 * presumably row-length bookkeeping for those routines -- confirm there. */
334 #ifdef PNG_THREAD_UNSAFE_OK
335 static png_uint_32 _FullLength;
336 static png_uint_32 _MMXLength;
337 static int _dif;
338 static int _patemp; // temp variables for Paeth routine
339 static int _pbtemp;
340 static int _pctemp;
341 #endif /* PNG_THREAD_UNSAFE_OK */
343 void /* PRIVATE */
344 png_squelch_warnings(void)
345 {
346 #ifdef PNG_THREAD_UNSAFE_OK
347 _dif = _dif;
348 _patemp = _patemp;
349 _pbtemp = _pbtemp;
350 _pctemp = _pctemp;
351 _MMXLength = _MMXLength;
352 #endif
353 _const4 = _const4;
354 _const6 = _const6;
355 _mask8_0 = _mask8_0;
356 _mask16_1 = _mask16_1;
357 _mask16_0 = _mask16_0;
358 _mask24_2 = _mask24_2;
359 _mask24_1 = _mask24_1;
360 _mask24_0 = _mask24_0;
361 _mask32_3 = _mask32_3;
362 _mask32_2 = _mask32_2;
363 _mask32_1 = _mask32_1;
364 _mask32_0 = _mask32_0;
365 _mask48_5 = _mask48_5;
366 _mask48_4 = _mask48_4;
367 _mask48_3 = _mask48_3;
368 _mask48_2 = _mask48_2;
369 _mask48_1 = _mask48_1;
370 _mask48_0 = _mask48_0;
371 }
372 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* MMX availability flag.  The initial value 2 serves as a "not yet
 * determined" marker: png_combine_row() checks for it, emits the warning
 * "asm_flags may not have been initialized" and calls png_mmx_support().
 * NOTE(review): the code that stores the detected value is not in this part
 * of the file; the changelog says png_mmx_support() sets it automatically. */
375 static int _mmx_supported = 2;
377 /*===========================================================================*/
378 /* */
379 /* P N G _ C O M B I N E _ R O W */
380 /* */
381 /*===========================================================================*/
383 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
/* Named bytes-per-pixel constants.  The C fallback paths of png_combine_row()
 * multiply pixel offsets, strides and counts by these to obtain byte values
 * for the matching pixel depth. */
385 #define BPP2 2
386 #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
387 #define BPP4 4
388 #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
389 #define BPP8 8
391 /* Combines the row recently read in with the previous row.
392 This routine takes care of alpha and transparency if requested.
393 This routine also handles the two methods of progressive display
394 of interlaced images, depending on the mask value.
395 The mask value describes which pixels are to be combined with
396 the row. The pattern always repeats every 8 pixels, so just 8
397 bits are needed. A one indicates the pixel is to be combined; a
398 zero indicates the pixel is to be skipped. This is in addition
399 to any alpha or transparency value associated with the pixel.
400 If you want all pixels to be combined, pass 0xff (255) in mask. */
402 /* Use this routine for the x86 platform - it uses a faster MMX routine
403 if the machine supports MMX. */
405 void /* PRIVATE */
406 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
407 {
408 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
410 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
411 if (_mmx_supported == 2) {
412 /* this should have happened in png_init_mmx_flags() already */
413 png_warning(png_ptr, "asm_flags may not have been initialized");
414 png_mmx_support();
415 }
416 #endif
418 if (mask == 0xff)
419 {
420 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
421 png_memcpy(row, png_ptr->row_buf + 1,
422 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
423 }
424 else /* (png_combine_row() is never called with mask == 0) */
425 {
426 switch (png_ptr->row_info.pixel_depth)
427 {
428 case 1: /* png_ptr->row_info.pixel_depth */
429 {
430 png_bytep sp;
431 png_bytep dp;
432 int s_inc, s_start, s_end;
433 int m;
434 int shift;
435 png_uint_32 i;
437 sp = png_ptr->row_buf + 1;
438 dp = row;
439 m = 0x80;
440 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
441 if (png_ptr->transformations & PNG_PACKSWAP)
442 {
443 s_start = 0;
444 s_end = 7;
445 s_inc = 1;
446 }
447 else
448 #endif
449 {
450 s_start = 7;
451 s_end = 0;
452 s_inc = -1;
453 }
455 shift = s_start;
457 for (i = 0; i < png_ptr->width; i++)
458 {
459 if (m & mask)
460 {
461 int value;
463 value = (*sp >> shift) & 0x1;
464 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
465 *dp |= (png_byte)(value << shift);
466 }
468 if (shift == s_end)
469 {
470 shift = s_start;
471 sp++;
472 dp++;
473 }
474 else
475 shift += s_inc;
477 if (m == 1)
478 m = 0x80;
479 else
480 m >>= 1;
481 }
482 break;
483 }
485 case 2: /* png_ptr->row_info.pixel_depth */
486 {
487 png_bytep sp;
488 png_bytep dp;
489 int s_start, s_end, s_inc;
490 int m;
491 int shift;
492 png_uint_32 i;
493 int value;
495 sp = png_ptr->row_buf + 1;
496 dp = row;
497 m = 0x80;
498 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
499 if (png_ptr->transformations & PNG_PACKSWAP)
500 {
501 s_start = 0;
502 s_end = 6;
503 s_inc = 2;
504 }
505 else
506 #endif
507 {
508 s_start = 6;
509 s_end = 0;
510 s_inc = -2;
511 }
513 shift = s_start;
515 for (i = 0; i < png_ptr->width; i++)
516 {
517 if (m & mask)
518 {
519 value = (*sp >> shift) & 0x3;
520 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
521 *dp |= (png_byte)(value << shift);
522 }
524 if (shift == s_end)
525 {
526 shift = s_start;
527 sp++;
528 dp++;
529 }
530 else
531 shift += s_inc;
532 if (m == 1)
533 m = 0x80;
534 else
535 m >>= 1;
536 }
537 break;
538 }
540 case 4: /* png_ptr->row_info.pixel_depth */
541 {
542 png_bytep sp;
543 png_bytep dp;
544 int s_start, s_end, s_inc;
545 int m;
546 int shift;
547 png_uint_32 i;
548 int value;
550 sp = png_ptr->row_buf + 1;
551 dp = row;
552 m = 0x80;
553 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
554 if (png_ptr->transformations & PNG_PACKSWAP)
555 {
556 s_start = 0;
557 s_end = 4;
558 s_inc = 4;
559 }
560 else
561 #endif
562 {
563 s_start = 4;
564 s_end = 0;
565 s_inc = -4;
566 }
567 shift = s_start;
569 for (i = 0; i < png_ptr->width; i++)
570 {
571 if (m & mask)
572 {
573 value = (*sp >> shift) & 0xf;
574 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
575 *dp |= (png_byte)(value << shift);
576 }
578 if (shift == s_end)
579 {
580 shift = s_start;
581 sp++;
582 dp++;
583 }
584 else
585 shift += s_inc;
586 if (m == 1)
587 m = 0x80;
588 else
589 m >>= 1;
590 }
591 break;
592 }
594 case 8: /* png_ptr->row_info.pixel_depth */
595 {
596 png_bytep srcptr;
597 png_bytep dstptr;
599 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
600 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
601 /* && _mmx_supported */ )
602 {
603 png_uint_32 len;
604 int diff;
605 int dummy_value_a; // fix 'forbidden register spilled' error
606 int dummy_value_d;
607 int dummy_value_c;
608 int dummy_value_S;
609 int dummy_value_D;
610 _unmask = ~mask; // global variable for -fPIC version
611 srcptr = png_ptr->row_buf + 1;
612 dstptr = row;
613 len = png_ptr->width &~7; // reduce to multiple of 8
614 diff = (int) (png_ptr->width & 7); // amount lost
616 __asm__ __volatile__ (
617 "movd _unmask, %%mm7 \n\t" // load bit pattern
618 "psubb %%mm6, %%mm6 \n\t" // zero mm6
619 "punpcklbw %%mm7, %%mm7 \n\t"
620 "punpcklwd %%mm7, %%mm7 \n\t"
621 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
623 "movq _mask8_0, %%mm0 \n\t"
624 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
625 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
627 // preload "movl len, %%ecx \n\t" // load length of line
628 // preload "movl srcptr, %%esi \n\t" // load source
629 // preload "movl dstptr, %%edi \n\t" // load dest
631 "cmpl $0, %%ecx \n\t" // len == 0 ?
632 "je mainloop8end \n\t"
634 "mainloop8: \n\t"
635 "movq (%%esi), %%mm4 \n\t" // *srcptr
636 "pand %%mm0, %%mm4 \n\t"
637 "movq %%mm0, %%mm6 \n\t"
638 "pandn (%%edi), %%mm6 \n\t" // *dstptr
639 "por %%mm6, %%mm4 \n\t"
640 "movq %%mm4, (%%edi) \n\t"
641 "addl $8, %%esi \n\t" // inc by 8 bytes processed
642 "addl $8, %%edi \n\t"
643 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
644 "ja mainloop8 \n\t"
646 "mainloop8end: \n\t"
647 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
648 "movl %%eax, %%ecx \n\t"
649 "cmpl $0, %%ecx \n\t"
650 "jz end8 \n\t"
651 // preload "movl mask, %%edx \n\t"
652 "sall $24, %%edx \n\t" // make low byte, high byte
654 "secondloop8: \n\t"
655 "sall %%edx \n\t" // move high bit to CF
656 "jnc skip8 \n\t" // if CF = 0
657 "movb (%%esi), %%al \n\t"
658 "movb %%al, (%%edi) \n\t"
660 "skip8: \n\t"
661 "incl %%esi \n\t"
662 "incl %%edi \n\t"
663 "decl %%ecx \n\t"
664 "jnz secondloop8 \n\t"
666 "end8: \n\t"
667 "EMMS \n\t" // DONE
669 : "=a" (dummy_value_a), // output regs (dummy)
670 "=d" (dummy_value_d),
671 "=c" (dummy_value_c),
672 "=S" (dummy_value_S),
673 "=D" (dummy_value_D)
675 : "3" (srcptr), // esi // input regs
676 "4" (dstptr), // edi
677 "0" (diff), // eax
678 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
679 "2" (len), // ecx
680 "1" (mask) // edx
682 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
683 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
684 #endif
685 );
686 }
687 else /* mmx _not supported - Use modified C routine */
688 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
689 {
690 register png_uint_32 i;
691 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
692 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
693 register int stride = png_pass_inc[png_ptr->pass];
694 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
695 register int rep_bytes = png_pass_width[png_ptr->pass];
696 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
697 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
698 int diff = (int) (png_ptr->width & 7); /* amount lost */
699 register png_uint_32 final_val = len; /* GRR bugfix */
701 srcptr = png_ptr->row_buf + 1 + initial_val;
702 dstptr = row + initial_val;
704 for (i = initial_val; i < final_val; i += stride)
705 {
706 png_memcpy(dstptr, srcptr, rep_bytes);
707 srcptr += stride;
708 dstptr += stride;
709 }
710 if (diff) /* number of leftover pixels: 3 for pngtest */
711 {
712 final_val+=diff /* *BPP1 */ ;
713 for (; i < final_val; i += stride)
714 {
715 if (rep_bytes > (int)(final_val-i))
716 rep_bytes = (int)(final_val-i);
717 png_memcpy(dstptr, srcptr, rep_bytes);
718 srcptr += stride;
719 dstptr += stride;
720 }
721 }
723 } /* end of else (_mmx_supported) */
725 break;
726 } /* end 8 bpp */
728 case 16: /* png_ptr->row_info.pixel_depth */
729 {
730 png_bytep srcptr;
731 png_bytep dstptr;
733 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
734 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
735 /* && _mmx_supported */ )
736 {
737 png_uint_32 len;
738 int diff;
739 int dummy_value_a; // fix 'forbidden register spilled' error
740 int dummy_value_d;
741 int dummy_value_c;
742 int dummy_value_S;
743 int dummy_value_D;
744 _unmask = ~mask; // global variable for -fPIC version
745 srcptr = png_ptr->row_buf + 1;
746 dstptr = row;
747 len = png_ptr->width &~7; // reduce to multiple of 8
748 diff = (int) (png_ptr->width & 7); // amount lost //
750 __asm__ __volatile__ (
751 "movd _unmask, %%mm7 \n\t" // load bit pattern
752 "psubb %%mm6, %%mm6 \n\t" // zero mm6
753 "punpcklbw %%mm7, %%mm7 \n\t"
754 "punpcklwd %%mm7, %%mm7 \n\t"
755 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
757 "movq _mask16_0, %%mm0 \n\t"
758 "movq _mask16_1, %%mm1 \n\t"
760 "pand %%mm7, %%mm0 \n\t"
761 "pand %%mm7, %%mm1 \n\t"
763 "pcmpeqb %%mm6, %%mm0 \n\t"
764 "pcmpeqb %%mm6, %%mm1 \n\t"
766 // preload "movl len, %%ecx \n\t" // load length of line
767 // preload "movl srcptr, %%esi \n\t" // load source
768 // preload "movl dstptr, %%edi \n\t" // load dest
770 "cmpl $0, %%ecx \n\t"
771 "jz mainloop16end \n\t"
773 "mainloop16: \n\t"
774 "movq (%%esi), %%mm4 \n\t"
775 "pand %%mm0, %%mm4 \n\t"
776 "movq %%mm0, %%mm6 \n\t"
777 "movq (%%edi), %%mm7 \n\t"
778 "pandn %%mm7, %%mm6 \n\t"
779 "por %%mm6, %%mm4 \n\t"
780 "movq %%mm4, (%%edi) \n\t"
782 "movq 8(%%esi), %%mm5 \n\t"
783 "pand %%mm1, %%mm5 \n\t"
784 "movq %%mm1, %%mm7 \n\t"
785 "movq 8(%%edi), %%mm6 \n\t"
786 "pandn %%mm6, %%mm7 \n\t"
787 "por %%mm7, %%mm5 \n\t"
788 "movq %%mm5, 8(%%edi) \n\t"
790 "addl $16, %%esi \n\t" // inc by 16 bytes processed
791 "addl $16, %%edi \n\t"
792 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
793 "ja mainloop16 \n\t"
795 "mainloop16end: \n\t"
796 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
797 "movl %%eax, %%ecx \n\t"
798 "cmpl $0, %%ecx \n\t"
799 "jz end16 \n\t"
800 // preload "movl mask, %%edx \n\t"
801 "sall $24, %%edx \n\t" // make low byte, high byte
803 "secondloop16: \n\t"
804 "sall %%edx \n\t" // move high bit to CF
805 "jnc skip16 \n\t" // if CF = 0
806 "movw (%%esi), %%ax \n\t"
807 "movw %%ax, (%%edi) \n\t"
809 "skip16: \n\t"
810 "addl $2, %%esi \n\t"
811 "addl $2, %%edi \n\t"
812 "decl %%ecx \n\t"
813 "jnz secondloop16 \n\t"
815 "end16: \n\t"
816 "EMMS \n\t" // DONE
818 : "=a" (dummy_value_a), // output regs (dummy)
819 "=c" (dummy_value_c),
820 "=d" (dummy_value_d),
821 "=S" (dummy_value_S),
822 "=D" (dummy_value_D)
824 : "0" (diff), // eax // input regs
825 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
826 "1" (len), // ecx
827 "2" (mask), // edx
828 "3" (srcptr), // esi
829 "4" (dstptr) // edi
831 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
832 : "%mm0", "%mm1", "%mm4" // clobber list
833 , "%mm5", "%mm6", "%mm7"
834 #endif
835 );
836 }
837 else /* mmx _not supported - Use modified C routine */
838 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
839 {
840 register png_uint_32 i;
841 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
842 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
843 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
844 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
845 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
846 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
847 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
848 int diff = (int) (png_ptr->width & 7); /* amount lost */
849 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
851 srcptr = png_ptr->row_buf + 1 + initial_val;
852 dstptr = row + initial_val;
854 for (i = initial_val; i < final_val; i += stride)
855 {
856 png_memcpy(dstptr, srcptr, rep_bytes);
857 srcptr += stride;
858 dstptr += stride;
859 }
860 if (diff) /* number of leftover pixels: 3 for pngtest */
861 {
862 final_val+=diff*BPP2;
863 for (; i < final_val; i += stride)
864 {
865 if (rep_bytes > (int)(final_val-i))
866 rep_bytes = (int)(final_val-i);
867 png_memcpy(dstptr, srcptr, rep_bytes);
868 srcptr += stride;
869 dstptr += stride;
870 }
871 }
872 } /* end of else (_mmx_supported) */
874 break;
875 } /* end 16 bpp */
877 case 24: /* png_ptr->row_info.pixel_depth */
878 {
879 png_bytep srcptr;
880 png_bytep dstptr;
882 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
883 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
884 /* && _mmx_supported */ )
885 {
886 png_uint_32 len;
887 int diff;
888 int dummy_value_a; // fix 'forbidden register spilled' error
889 int dummy_value_d;
890 int dummy_value_c;
891 int dummy_value_S;
892 int dummy_value_D;
893 _unmask = ~mask; // global variable for -fPIC version
894 srcptr = png_ptr->row_buf + 1;
895 dstptr = row;
896 len = png_ptr->width &~7; // reduce to multiple of 8
897 diff = (int) (png_ptr->width & 7); // amount lost //
899 __asm__ __volatile__ (
900 "movd _unmask, %%mm7 \n\t" // load bit pattern
901 "psubb %%mm6, %%mm6 \n\t" // zero mm6
902 "punpcklbw %%mm7, %%mm7 \n\t"
903 "punpcklwd %%mm7, %%mm7 \n\t"
904 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
906 "movq _mask24_0, %%mm0 \n\t"
907 "movq _mask24_1, %%mm1 \n\t"
908 "movq _mask24_2, %%mm2 \n\t"
910 "pand %%mm7, %%mm0 \n\t"
911 "pand %%mm7, %%mm1 \n\t"
912 "pand %%mm7, %%mm2 \n\t"
914 "pcmpeqb %%mm6, %%mm0 \n\t"
915 "pcmpeqb %%mm6, %%mm1 \n\t"
916 "pcmpeqb %%mm6, %%mm2 \n\t"
918 // preload "movl len, %%ecx \n\t" // load length of line
919 // preload "movl srcptr, %%esi \n\t" // load source
920 // preload "movl dstptr, %%edi \n\t" // load dest
922 "cmpl $0, %%ecx \n\t"
923 "jz mainloop24end \n\t"
925 "mainloop24: \n\t"
926 "movq (%%esi), %%mm4 \n\t"
927 "pand %%mm0, %%mm4 \n\t"
928 "movq %%mm0, %%mm6 \n\t"
929 "movq (%%edi), %%mm7 \n\t"
930 "pandn %%mm7, %%mm6 \n\t"
931 "por %%mm6, %%mm4 \n\t"
932 "movq %%mm4, (%%edi) \n\t"
934 "movq 8(%%esi), %%mm5 \n\t"
935 "pand %%mm1, %%mm5 \n\t"
936 "movq %%mm1, %%mm7 \n\t"
937 "movq 8(%%edi), %%mm6 \n\t"
938 "pandn %%mm6, %%mm7 \n\t"
939 "por %%mm7, %%mm5 \n\t"
940 "movq %%mm5, 8(%%edi) \n\t"
942 "movq 16(%%esi), %%mm6 \n\t"
943 "pand %%mm2, %%mm6 \n\t"
944 "movq %%mm2, %%mm4 \n\t"
945 "movq 16(%%edi), %%mm7 \n\t"
946 "pandn %%mm7, %%mm4 \n\t"
947 "por %%mm4, %%mm6 \n\t"
948 "movq %%mm6, 16(%%edi) \n\t"
950 "addl $24, %%esi \n\t" // inc by 24 bytes processed
951 "addl $24, %%edi \n\t"
952 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
954 "ja mainloop24 \n\t"
956 "mainloop24end: \n\t"
957 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
958 "movl %%eax, %%ecx \n\t"
959 "cmpl $0, %%ecx \n\t"
960 "jz end24 \n\t"
961 // preload "movl mask, %%edx \n\t"
962 "sall $24, %%edx \n\t" // make low byte, high byte
964 "secondloop24: \n\t"
965 "sall %%edx \n\t" // move high bit to CF
966 "jnc skip24 \n\t" // if CF = 0
967 "movw (%%esi), %%ax \n\t"
968 "movw %%ax, (%%edi) \n\t"
969 "xorl %%eax, %%eax \n\t"
970 "movb 2(%%esi), %%al \n\t"
971 "movb %%al, 2(%%edi) \n\t"
973 "skip24: \n\t"
974 "addl $3, %%esi \n\t"
975 "addl $3, %%edi \n\t"
976 "decl %%ecx \n\t"
977 "jnz secondloop24 \n\t"
979 "end24: \n\t"
980 "EMMS \n\t" // DONE
982 : "=a" (dummy_value_a), // output regs (dummy)
983 "=d" (dummy_value_d),
984 "=c" (dummy_value_c),
985 "=S" (dummy_value_S),
986 "=D" (dummy_value_D)
988 : "3" (srcptr), // esi // input regs
989 "4" (dstptr), // edi
990 "0" (diff), // eax
991 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
992 "2" (len), // ecx
993 "1" (mask) // edx
995 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
996 : "%mm0", "%mm1", "%mm2" // clobber list
997 , "%mm4", "%mm5", "%mm6", "%mm7"
998 #endif
999 );
1000 }
1001 else /* mmx _not supported - Use modified C routine */
1002 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1003 {
1004 register png_uint_32 i;
1005 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1006 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1007 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1008 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1009 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1010 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1011 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1012 int diff = (int) (png_ptr->width & 7); /* amount lost */
1013 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1015 srcptr = png_ptr->row_buf + 1 + initial_val;
1016 dstptr = row + initial_val;
1018 for (i = initial_val; i < final_val; i += stride)
1019 {
1020 png_memcpy(dstptr, srcptr, rep_bytes);
1021 srcptr += stride;
1022 dstptr += stride;
1023 }
1024 if (diff) /* number of leftover pixels: 3 for pngtest */
1025 {
1026 final_val+=diff*BPP3;
1027 for (; i < final_val; i += stride)
1028 {
1029 if (rep_bytes > (int)(final_val-i))
1030 rep_bytes = (int)(final_val-i);
1031 png_memcpy(dstptr, srcptr, rep_bytes);
1032 srcptr += stride;
1033 dstptr += stride;
1034 }
1035 }
1036 } /* end of else (_mmx_supported) */
1038 break;
1039 } /* end 24 bpp */
1041 case 32: /* png_ptr->row_info.pixel_depth */
1042 {
1043 png_bytep srcptr;
1044 png_bytep dstptr;
1046 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1047 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1048 /* && _mmx_supported */ )
1049 {
1050 png_uint_32 len;
1051 int diff;
1052 int dummy_value_a; // fix 'forbidden register spilled' error
1053 int dummy_value_d;
1054 int dummy_value_c;
1055 int dummy_value_S;
1056 int dummy_value_D;
1057 _unmask = ~mask; // global variable for -fPIC version
1058 srcptr = png_ptr->row_buf + 1;
1059 dstptr = row;
1060 len = png_ptr->width &~7; // reduce to multiple of 8
1061 diff = (int) (png_ptr->width & 7); // amount lost //
1063 __asm__ __volatile__ (
1064 "movd _unmask, %%mm7 \n\t" // load bit pattern
1065 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1066 "punpcklbw %%mm7, %%mm7 \n\t"
1067 "punpcklwd %%mm7, %%mm7 \n\t"
1068 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1070 "movq _mask32_0, %%mm0 \n\t"
1071 "movq _mask32_1, %%mm1 \n\t"
1072 "movq _mask32_2, %%mm2 \n\t"
1073 "movq _mask32_3, %%mm3 \n\t"
1075 "pand %%mm7, %%mm0 \n\t"
1076 "pand %%mm7, %%mm1 \n\t"
1077 "pand %%mm7, %%mm2 \n\t"
1078 "pand %%mm7, %%mm3 \n\t"
1080 "pcmpeqb %%mm6, %%mm0 \n\t"
1081 "pcmpeqb %%mm6, %%mm1 \n\t"
1082 "pcmpeqb %%mm6, %%mm2 \n\t"
1083 "pcmpeqb %%mm6, %%mm3 \n\t"
1085 // preload "movl len, %%ecx \n\t" // load length of line
1086 // preload "movl srcptr, %%esi \n\t" // load source
1087 // preload "movl dstptr, %%edi \n\t" // load dest
1089 "cmpl $0, %%ecx \n\t" // lcr
1090 "jz mainloop32end \n\t"
1092 "mainloop32: \n\t"
1093 "movq (%%esi), %%mm4 \n\t"
1094 "pand %%mm0, %%mm4 \n\t"
1095 "movq %%mm0, %%mm6 \n\t"
1096 "movq (%%edi), %%mm7 \n\t"
1097 "pandn %%mm7, %%mm6 \n\t"
1098 "por %%mm6, %%mm4 \n\t"
1099 "movq %%mm4, (%%edi) \n\t"
1101 "movq 8(%%esi), %%mm5 \n\t"
1102 "pand %%mm1, %%mm5 \n\t"
1103 "movq %%mm1, %%mm7 \n\t"
1104 "movq 8(%%edi), %%mm6 \n\t"
1105 "pandn %%mm6, %%mm7 \n\t"
1106 "por %%mm7, %%mm5 \n\t"
1107 "movq %%mm5, 8(%%edi) \n\t"
1109 "movq 16(%%esi), %%mm6 \n\t"
1110 "pand %%mm2, %%mm6 \n\t"
1111 "movq %%mm2, %%mm4 \n\t"
1112 "movq 16(%%edi), %%mm7 \n\t"
1113 "pandn %%mm7, %%mm4 \n\t"
1114 "por %%mm4, %%mm6 \n\t"
1115 "movq %%mm6, 16(%%edi) \n\t"
1117 "movq 24(%%esi), %%mm7 \n\t"
1118 "pand %%mm3, %%mm7 \n\t"
1119 "movq %%mm3, %%mm5 \n\t"
1120 "movq 24(%%edi), %%mm4 \n\t"
1121 "pandn %%mm4, %%mm5 \n\t"
1122 "por %%mm5, %%mm7 \n\t"
1123 "movq %%mm7, 24(%%edi) \n\t"
1125 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1126 "addl $32, %%edi \n\t"
1127 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1128 "ja mainloop32 \n\t"
1130 "mainloop32end: \n\t"
1131 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1132 "movl %%eax, %%ecx \n\t"
1133 "cmpl $0, %%ecx \n\t"
1134 "jz end32 \n\t"
1135 // preload "movl mask, %%edx \n\t"
1136 "sall $24, %%edx \n\t" // low byte => high byte
1138 "secondloop32: \n\t"
1139 "sall %%edx \n\t" // move high bit to CF
1140 "jnc skip32 \n\t" // if CF = 0
1141 "movl (%%esi), %%eax \n\t"
1142 "movl %%eax, (%%edi) \n\t"
1144 "skip32: \n\t"
1145 "addl $4, %%esi \n\t"
1146 "addl $4, %%edi \n\t"
1147 "decl %%ecx \n\t"
1148 "jnz secondloop32 \n\t"
1150 "end32: \n\t"
1151 "EMMS \n\t" // DONE
1153 : "=a" (dummy_value_a), // output regs (dummy)
1154 "=d" (dummy_value_d),
1155 "=c" (dummy_value_c),
1156 "=S" (dummy_value_S),
1157 "=D" (dummy_value_D)
1159 : "3" (srcptr), // esi // input regs
1160 "4" (dstptr), // edi
1161 "0" (diff), // eax
1162 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1163 "2" (len), // ecx
1164 "1" (mask) // edx
1166 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1167 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1168 , "%mm4", "%mm5", "%mm6", "%mm7"
1169 #endif
1170 );
1171 }
1172 else /* mmx _not supported - Use modified C routine */
1173 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1174 {
1175 register png_uint_32 i;
1176 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1177 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1178 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1179 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1180 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1181 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1182 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1183 int diff = (int) (png_ptr->width & 7); /* amount lost */
1184 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1186 srcptr = png_ptr->row_buf + 1 + initial_val;
1187 dstptr = row + initial_val;
1189 for (i = initial_val; i < final_val; i += stride)
1190 {
1191 png_memcpy(dstptr, srcptr, rep_bytes);
1192 srcptr += stride;
1193 dstptr += stride;
1194 }
1195 if (diff) /* number of leftover pixels: 3 for pngtest */
1196 {
1197 final_val+=diff*BPP4;
1198 for (; i < final_val; i += stride)
1199 {
1200 if (rep_bytes > (int)(final_val-i))
1201 rep_bytes = (int)(final_val-i);
1202 png_memcpy(dstptr, srcptr, rep_bytes);
1203 srcptr += stride;
1204 dstptr += stride;
1205 }
1206 }
1207 } /* end of else (_mmx_supported) */
1209 break;
1210 } /* end 32 bpp */
1212 case 48: /* png_ptr->row_info.pixel_depth */
1213 {
1214 png_bytep srcptr;
1215 png_bytep dstptr;
1217 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1218 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1219 /* && _mmx_supported */ )
1220 {
1221 png_uint_32 len;
1222 int diff;
1223 int dummy_value_a; // fix 'forbidden register spilled' error
1224 int dummy_value_d;
1225 int dummy_value_c;
1226 int dummy_value_S;
1227 int dummy_value_D;
1228 _unmask = ~mask; // global variable for -fPIC version
1229 srcptr = png_ptr->row_buf + 1;
1230 dstptr = row;
1231 len = png_ptr->width &~7; // reduce to multiple of 8
1232 diff = (int) (png_ptr->width & 7); // amount lost //
1234 __asm__ __volatile__ (
1235 "movd _unmask, %%mm7 \n\t" // load bit pattern
1236 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1237 "punpcklbw %%mm7, %%mm7 \n\t"
1238 "punpcklwd %%mm7, %%mm7 \n\t"
1239 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1241 "movq _mask48_0, %%mm0 \n\t"
1242 "movq _mask48_1, %%mm1 \n\t"
1243 "movq _mask48_2, %%mm2 \n\t"
1244 "movq _mask48_3, %%mm3 \n\t"
1245 "movq _mask48_4, %%mm4 \n\t"
1246 "movq _mask48_5, %%mm5 \n\t"
1248 "pand %%mm7, %%mm0 \n\t"
1249 "pand %%mm7, %%mm1 \n\t"
1250 "pand %%mm7, %%mm2 \n\t"
1251 "pand %%mm7, %%mm3 \n\t"
1252 "pand %%mm7, %%mm4 \n\t"
1253 "pand %%mm7, %%mm5 \n\t"
1255 "pcmpeqb %%mm6, %%mm0 \n\t"
1256 "pcmpeqb %%mm6, %%mm1 \n\t"
1257 "pcmpeqb %%mm6, %%mm2 \n\t"
1258 "pcmpeqb %%mm6, %%mm3 \n\t"
1259 "pcmpeqb %%mm6, %%mm4 \n\t"
1260 "pcmpeqb %%mm6, %%mm5 \n\t"
1262 // preload "movl len, %%ecx \n\t" // load length of line
1263 // preload "movl srcptr, %%esi \n\t" // load source
1264 // preload "movl dstptr, %%edi \n\t" // load dest
1266 "cmpl $0, %%ecx \n\t"
1267 "jz mainloop48end \n\t"
1269 "mainloop48: \n\t"
1270 "movq (%%esi), %%mm7 \n\t"
1271 "pand %%mm0, %%mm7 \n\t"
1272 "movq %%mm0, %%mm6 \n\t"
1273 "pandn (%%edi), %%mm6 \n\t"
1274 "por %%mm6, %%mm7 \n\t"
1275 "movq %%mm7, (%%edi) \n\t"
1277 "movq 8(%%esi), %%mm6 \n\t"
1278 "pand %%mm1, %%mm6 \n\t"
1279 "movq %%mm1, %%mm7 \n\t"
1280 "pandn 8(%%edi), %%mm7 \n\t"
1281 "por %%mm7, %%mm6 \n\t"
1282 "movq %%mm6, 8(%%edi) \n\t"
1284 "movq 16(%%esi), %%mm6 \n\t"
1285 "pand %%mm2, %%mm6 \n\t"
1286 "movq %%mm2, %%mm7 \n\t"
1287 "pandn 16(%%edi), %%mm7 \n\t"
1288 "por %%mm7, %%mm6 \n\t"
1289 "movq %%mm6, 16(%%edi) \n\t"
1291 "movq 24(%%esi), %%mm7 \n\t"
1292 "pand %%mm3, %%mm7 \n\t"
1293 "movq %%mm3, %%mm6 \n\t"
1294 "pandn 24(%%edi), %%mm6 \n\t"
1295 "por %%mm6, %%mm7 \n\t"
1296 "movq %%mm7, 24(%%edi) \n\t"
1298 "movq 32(%%esi), %%mm6 \n\t"
1299 "pand %%mm4, %%mm6 \n\t"
1300 "movq %%mm4, %%mm7 \n\t"
1301 "pandn 32(%%edi), %%mm7 \n\t"
1302 "por %%mm7, %%mm6 \n\t"
1303 "movq %%mm6, 32(%%edi) \n\t"
1305 "movq 40(%%esi), %%mm7 \n\t"
1306 "pand %%mm5, %%mm7 \n\t"
1307 "movq %%mm5, %%mm6 \n\t"
1308 "pandn 40(%%edi), %%mm6 \n\t"
1309 "por %%mm6, %%mm7 \n\t"
1310 "movq %%mm7, 40(%%edi) \n\t"
1312 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1313 "addl $48, %%edi \n\t"
1314 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1316 "ja mainloop48 \n\t"
1318 "mainloop48end: \n\t"
1319 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1320 "movl %%eax, %%ecx \n\t"
1321 "cmpl $0, %%ecx \n\t"
1322 "jz end48 \n\t"
1323 // preload "movl mask, %%edx \n\t"
1324 "sall $24, %%edx \n\t" // make low byte, high byte
1326 "secondloop48: \n\t"
1327 "sall %%edx \n\t" // move high bit to CF
1328 "jnc skip48 \n\t" // if CF = 0
1329 "movl (%%esi), %%eax \n\t"
1330 "movl %%eax, (%%edi) \n\t"
1332 "skip48: \n\t"
1333 "addl $4, %%esi \n\t"
1334 "addl $4, %%edi \n\t"
1335 "decl %%ecx \n\t"
1336 "jnz secondloop48 \n\t"
1338 "end48: \n\t"
1339 "EMMS \n\t" // DONE
1341 : "=a" (dummy_value_a), // output regs (dummy)
1342 "=d" (dummy_value_d),
1343 "=c" (dummy_value_c),
1344 "=S" (dummy_value_S),
1345 "=D" (dummy_value_D)
1347 : "3" (srcptr), // esi // input regs
1348 "4" (dstptr), // edi
1349 "0" (diff), // eax
1350 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1351 "2" (len), // ecx
1352 "1" (mask) // edx
1354 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1355 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1356 , "%mm4", "%mm5", "%mm6", "%mm7"
1357 #endif
1358 );
1359 }
1360 else /* mmx _not supported - Use modified C routine */
1361 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1362 {
1363 register png_uint_32 i;
1364 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1365 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1366 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1367 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1368 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1369 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1370 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1371 int diff = (int) (png_ptr->width & 7); /* amount lost */
1372 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1374 srcptr = png_ptr->row_buf + 1 + initial_val;
1375 dstptr = row + initial_val;
1377 for (i = initial_val; i < final_val; i += stride)
1378 {
1379 png_memcpy(dstptr, srcptr, rep_bytes);
1380 srcptr += stride;
1381 dstptr += stride;
1382 }
1383 if (diff) /* number of leftover pixels: 3 for pngtest */
1384 {
1385 final_val+=diff*BPP6;
1386 for (; i < final_val; i += stride)
1387 {
1388 if (rep_bytes > (int)(final_val-i))
1389 rep_bytes = (int)(final_val-i);
1390 png_memcpy(dstptr, srcptr, rep_bytes);
1391 srcptr += stride;
1392 dstptr += stride;
1393 }
1394 }
1395 } /* end of else (_mmx_supported) */
1397 break;
1398 } /* end 48 bpp */
1400 case 64: /* png_ptr->row_info.pixel_depth */
1401 {
1402 png_bytep srcptr;
1403 png_bytep dstptr;
1404 register png_uint_32 i;
1405 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1406 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1407 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1408 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1409 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1410 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1411 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1412 int diff = (int) (png_ptr->width & 7); /* amount lost */
1413 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1415 srcptr = png_ptr->row_buf + 1 + initial_val;
1416 dstptr = row + initial_val;
1418 for (i = initial_val; i < final_val; i += stride)
1419 {
1420 png_memcpy(dstptr, srcptr, rep_bytes);
1421 srcptr += stride;
1422 dstptr += stride;
1423 }
1424 if (diff) /* number of leftover pixels: 3 for pngtest */
1425 {
1426 final_val+=diff*BPP8;
1427 for (; i < final_val; i += stride)
1428 {
1429 if (rep_bytes > (int)(final_val-i))
1430 rep_bytes = (int)(final_val-i);
1431 png_memcpy(dstptr, srcptr, rep_bytes);
1432 srcptr += stride;
1433 dstptr += stride;
1434 }
1435 }
1437 break;
1438 } /* end 64 bpp */
1440 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1441 {
1442 /* this should never happen */
1443 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1444 break;
1445 }
1446 } /* end switch (png_ptr->row_info.pixel_depth) */
1448 } /* end if (non-trivial mask) */
1450 } /* end png_combine_row() */
1452 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1457 /*===========================================================================*/
1458 /* */
1459 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1460 /* */
1461 /*===========================================================================*/
1463 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1464 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1466 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1467 * has taken place. [GRR: what other steps come before and/or after?]
1468 */
1470 void /* PRIVATE */
1471 png_do_read_interlace(png_structp png_ptr)
1472 {
1473 png_row_infop row_info = &(png_ptr->row_info);
1474 png_bytep row = png_ptr->row_buf + 1;
1475 int pass = png_ptr->pass;
1476 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1477 png_uint_32 transformations = png_ptr->transformations;
1478 #endif
1480 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1482 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1483 if (_mmx_supported == 2) {
1484 /* this should have happened in png_init_mmx_flags() already */
1485 png_warning(png_ptr, "asm_flags may not have been initialized");
1486 png_mmx_support();
1487 }
1488 #endif
1490 if (row != NULL && row_info != NULL)
1491 {
1492 png_uint_32 final_width;
1494 final_width = row_info->width * png_pass_inc[pass];
1496 switch (row_info->pixel_depth)
1497 {
1498 case 1:
1499 {
1500 png_bytep sp, dp;
1501 int sshift, dshift;
1502 int s_start, s_end, s_inc;
1503 png_byte v;
1504 png_uint_32 i;
1505 int j;
1507 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1508 dp = row + (png_size_t)((final_width - 1) >> 3);
1509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1510 if (transformations & PNG_PACKSWAP)
1511 {
1512 sshift = (int)((row_info->width + 7) & 7);
1513 dshift = (int)((final_width + 7) & 7);
1514 s_start = 7;
1515 s_end = 0;
1516 s_inc = -1;
1517 }
1518 else
1519 #endif
1520 {
1521 sshift = 7 - (int)((row_info->width + 7) & 7);
1522 dshift = 7 - (int)((final_width + 7) & 7);
1523 s_start = 0;
1524 s_end = 7;
1525 s_inc = 1;
1526 }
1528 for (i = row_info->width; i; i--)
1529 {
1530 v = (png_byte)((*sp >> sshift) & 0x1);
1531 for (j = 0; j < png_pass_inc[pass]; j++)
1532 {
1533 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1534 *dp |= (png_byte)(v << dshift);
1535 if (dshift == s_end)
1536 {
1537 dshift = s_start;
1538 dp--;
1539 }
1540 else
1541 dshift += s_inc;
1542 }
1543 if (sshift == s_end)
1544 {
1545 sshift = s_start;
1546 sp--;
1547 }
1548 else
1549 sshift += s_inc;
1550 }
1551 break;
1552 }
1554 case 2:
1555 {
1556 png_bytep sp, dp;
1557 int sshift, dshift;
1558 int s_start, s_end, s_inc;
1559 png_uint_32 i;
1561 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1562 dp = row + (png_size_t)((final_width - 1) >> 2);
1563 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1564 if (transformations & PNG_PACKSWAP)
1565 {
1566 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1567 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1568 s_start = 6;
1569 s_end = 0;
1570 s_inc = -2;
1571 }
1572 else
1573 #endif
1574 {
1575 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1576 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1577 s_start = 0;
1578 s_end = 6;
1579 s_inc = 2;
1580 }
1582 for (i = row_info->width; i; i--)
1583 {
1584 png_byte v;
1585 int j;
1587 v = (png_byte)((*sp >> sshift) & 0x3);
1588 for (j = 0; j < png_pass_inc[pass]; j++)
1589 {
1590 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1591 *dp |= (png_byte)(v << dshift);
1592 if (dshift == s_end)
1593 {
1594 dshift = s_start;
1595 dp--;
1596 }
1597 else
1598 dshift += s_inc;
1599 }
1600 if (sshift == s_end)
1601 {
1602 sshift = s_start;
1603 sp--;
1604 }
1605 else
1606 sshift += s_inc;
1607 }
1608 break;
1609 }
1611 case 4:
1612 {
1613 png_bytep sp, dp;
1614 int sshift, dshift;
1615 int s_start, s_end, s_inc;
1616 png_uint_32 i;
1618 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1619 dp = row + (png_size_t)((final_width - 1) >> 1);
1620 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1621 if (transformations & PNG_PACKSWAP)
1622 {
1623 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1624 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1625 s_start = 4;
1626 s_end = 0;
1627 s_inc = -4;
1628 }
1629 else
1630 #endif
1631 {
1632 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1633 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1634 s_start = 0;
1635 s_end = 4;
1636 s_inc = 4;
1637 }
1639 for (i = row_info->width; i; i--)
1640 {
1641 png_byte v;
1642 int j;
1644 v = (png_byte)((*sp >> sshift) & 0xf);
1645 for (j = 0; j < png_pass_inc[pass]; j++)
1646 {
1647 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1648 *dp |= (png_byte)(v << dshift);
1649 if (dshift == s_end)
1650 {
1651 dshift = s_start;
1652 dp--;
1653 }
1654 else
1655 dshift += s_inc;
1656 }
1657 if (sshift == s_end)
1658 {
1659 sshift = s_start;
1660 sp--;
1661 }
1662 else
1663 sshift += s_inc;
1664 }
1665 break;
1666 }
1668 /*====================================================================*/
1670 default: /* 8-bit or larger (this is where the routine is modified) */
1671 {
1672 #if 0
1673 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1674 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1675 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1676 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1677 #endif
1678 png_bytep sptr, dp;
1679 png_uint_32 i;
1680 png_size_t pixel_bytes;
1681 int width = (int)row_info->width;
1683 pixel_bytes = (row_info->pixel_depth >> 3);
1685 /* point sptr at the last pixel in the pre-expanded row: */
1686 sptr = row + (width - 1) * pixel_bytes;
1688 /* point dp at the last pixel position in the expanded row: */
1689 dp = row + (final_width - 1) * pixel_bytes;
1691 /* New code by Nirav Chhatrapati - Intel Corporation */
1693 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1694 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1695 /* && _mmx_supported */ )
1696 {
1697 //--------------------------------------------------------------
1698 if (pixel_bytes == 3)
1699 {
1700 if (((pass == 0) || (pass == 1)) && width)
1701 {
1702 int dummy_value_c; // fix 'forbidden register spilled'
1703 int dummy_value_S;
1704 int dummy_value_D;
1706 __asm__ __volatile__ (
1707 "subl $21, %%edi \n\t"
1708 // (png_pass_inc[pass] - 1)*pixel_bytes
1710 ".loop3_pass0: \n\t"
1711 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1712 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1713 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1714 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1715 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1716 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1717 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1718 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1719 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1720 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1721 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1722 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1723 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1724 "movq %%mm4, 16(%%edi) \n\t"
1725 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1726 "movq %%mm3, 8(%%edi) \n\t"
1727 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1728 "subl $3, %%esi \n\t"
1729 "movq %%mm0, (%%edi) \n\t"
1730 "subl $24, %%edi \n\t"
1731 "decl %%ecx \n\t"
1732 "jnz .loop3_pass0 \n\t"
1733 "EMMS \n\t" // DONE
1735 : "=c" (dummy_value_c), // output regs (dummy)
1736 "=S" (dummy_value_S),
1737 "=D" (dummy_value_D)
1739 : "1" (sptr), // esi // input regs
1740 "2" (dp), // edi
1741 "0" (width) // ecx
1742 // doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
1744 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1745 : "%mm0", "%mm1", "%mm2" // clobber list
1746 , "%mm3", "%mm4"
1747 #endif
1748 );
1749 }
1750 else if (((pass == 2) || (pass == 3)) && width)
1751 {
1752 int dummy_value_c; // fix 'forbidden register spilled'
1753 int dummy_value_S;
1754 int dummy_value_D;
1756 __asm__ __volatile__ (
1757 "subl $9, %%edi \n\t"
1758 // (png_pass_inc[pass] - 1)*pixel_bytes
1760 ".loop3_pass2: \n\t"
1761 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1762 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1763 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1764 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1765 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1766 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1767 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1768 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1769 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1770 "movq %%mm0, 4(%%edi) \n\t"
1771 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1772 "subl $3, %%esi \n\t"
1773 "movd %%mm0, (%%edi) \n\t"
1774 "subl $12, %%edi \n\t"
1775 "decl %%ecx \n\t"
1776 "jnz .loop3_pass2 \n\t"
1777 "EMMS \n\t" // DONE
1779 : "=c" (dummy_value_c), // output regs (dummy)
1780 "=S" (dummy_value_S),
1781 "=D" (dummy_value_D)
1783 : "1" (sptr), // esi // input regs
1784 "2" (dp), // edi
1785 "0" (width) // ecx
1787 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1788 : "%mm0", "%mm1", "%mm2" // clobber list
1789 #endif
1790 );
1791 }
1792 else if (width) /* && ((pass == 4) || (pass == 5)) */
1793 {
1794 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1795 if (width_mmx < 0)
1796 width_mmx = 0;
1797 width -= width_mmx; // 8 or 9 pix (24 or 27 bytes) if width >= 8; else all 1-7 pix
1798 if (width_mmx)
1799 {
1800 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1801 // sptr points at last pixel in pre-expanded row
1802 // dp points at last pixel position in expanded row
1803 int dummy_value_c; // fix 'forbidden register spilled'
1804 int dummy_value_S;
1805 int dummy_value_D;
1807 __asm__ __volatile__ (
1808 "subl $3, %%esi \n\t"
1809 "subl $9, %%edi \n\t"
1810 // (png_pass_inc[pass] + 1)*pixel_bytes
1812 ".loop3_pass4: \n\t"
1813 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1814 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1815 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1816 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1817 "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1818 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1819 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1820 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1821 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1822 "movq %%mm0, (%%edi) \n\t"
1823 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1824 "pand _const6, %%mm3 \n\t" // z z z z z z z 5
1825 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1826 "subl $6, %%esi \n\t"
1827 "movd %%mm2, 8(%%edi) \n\t"
1828 "subl $12, %%edi \n\t"
1829 "subl $2, %%ecx \n\t"
1830 "jnz .loop3_pass4 \n\t"
1831 "EMMS \n\t" // DONE
1833 : "=c" (dummy_value_c), // output regs (dummy)
1834 "=S" (dummy_value_S),
1835 "=D" (dummy_value_D)
1837 : "1" (sptr), // esi // input regs
1838 "2" (dp), // edi
1839 "0" (width_mmx) // ecx
1841 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1842 : "%mm0", "%mm1" // clobber list
1843 , "%mm2", "%mm3"
1844 #endif
1845 );
1846 }
1848 sptr -= width_mmx*3;
1849 dp -= width_mmx*6;
1850 for (i = width; i; i--)
1851 {
1852 png_byte v[8];
1853 int j;
1855 png_memcpy(v, sptr, 3);
1856 for (j = 0; j < png_pass_inc[pass]; j++)
1857 {
1858 png_memcpy(dp, v, 3);
1859 dp -= 3;
1860 }
1861 sptr -= 3;
1862 }
1863 }
1864 } /* end of pixel_bytes == 3 */
1866 //--------------------------------------------------------------
1867 else if (pixel_bytes == 1)
1868 {
1869 if (((pass == 0) || (pass == 1)) && width)
1870 {
1871 int width_mmx = ((width >> 2) << 2);
1872 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1873 if (width_mmx)
1874 {
1875 int dummy_value_c; // fix 'forbidden register spilled'
1876 int dummy_value_S;
1877 int dummy_value_D;
1879 __asm__ __volatile__ (
1880 "subl $3, %%esi \n\t"
1881 "subl $31, %%edi \n\t"
1883 ".loop1_pass0: \n\t"
1884 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1885 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1886 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1887 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1888 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1889 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1890 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1891 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1892 "movq %%mm0, (%%edi) \n\t"
1893 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1894 "movq %%mm3, 8(%%edi) \n\t"
1895 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1896 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1897 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1898 "movq %%mm2, 16(%%edi) \n\t"
1899 "subl $4, %%esi \n\t"
1900 "movq %%mm4, 24(%%edi) \n\t"
1901 "subl $32, %%edi \n\t"
1902 "subl $4, %%ecx \n\t"
1903 "jnz .loop1_pass0 \n\t"
1904 "EMMS \n\t" // DONE
1906 : "=c" (dummy_value_c), // output regs (dummy)
1907 "=S" (dummy_value_S),
1908 "=D" (dummy_value_D)
1910 : "1" (sptr), // esi // input regs
1911 "2" (dp), // edi
1912 "0" (width_mmx) // ecx
1914 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1915 : "%mm0", "%mm1", "%mm2" // clobber list
1916 , "%mm3", "%mm4"
1917 #endif
1918 );
1919 }
1921 sptr -= width_mmx;
1922 dp -= width_mmx*8;
1923 for (i = width; i; i--)
1924 {
1925 int j;
1927 /* I simplified this part in version 1.0.4e
1928 * here and in several other instances where
1929 * pixel_bytes == 1 -- GR-P
1930 *
1931 * Original code:
1932 *
1933 * png_byte v[8];
1934 * png_memcpy(v, sptr, pixel_bytes);
1935 * for (j = 0; j < png_pass_inc[pass]; j++)
1936 * {
1937 * png_memcpy(dp, v, pixel_bytes);
1938 * dp -= pixel_bytes;
1939 * }
1940 * sptr -= pixel_bytes;
1941 *
1942 * Replacement code is in the next three lines:
1943 */
1945 for (j = 0; j < png_pass_inc[pass]; j++)
1946 {
1947 *dp-- = *sptr;
1948 }
1949 --sptr;
1950 }
1951 }
1952 else if (((pass == 2) || (pass == 3)) && width)
1953 {
1954 int width_mmx = ((width >> 2) << 2);
1955 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1956 if (width_mmx)
1957 {
1958 int dummy_value_c; // fix 'forbidden register spilled'
1959 int dummy_value_S;
1960 int dummy_value_D;
1962 __asm__ __volatile__ (
1963 "subl $3, %%esi \n\t"
1964 "subl $15, %%edi \n\t"
1966 ".loop1_pass2: \n\t"
1967 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1968 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1969 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
1970 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1971 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
1972 "movq %%mm0, (%%edi) \n\t"
1973 "subl $4, %%esi \n\t"
1974 "movq %%mm1, 8(%%edi) \n\t"
1975 "subl $16, %%edi \n\t"
1976 "subl $4, %%ecx \n\t"
1977 "jnz .loop1_pass2 \n\t"
1978 "EMMS \n\t" // DONE
1980 : "=c" (dummy_value_c), // output regs (dummy)
1981 "=S" (dummy_value_S),
1982 "=D" (dummy_value_D)
1984 : "1" (sptr), // esi // input regs
1985 "2" (dp), // edi
1986 "0" (width_mmx) // ecx
1988 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1989 : "%mm0", "%mm1" // clobber list
1990 #endif
1991 );
1992 }
1994 sptr -= width_mmx;
1995 dp -= width_mmx*4;
1996 for (i = width; i; i--)
1997 {
1998 int j;
2000 for (j = 0; j < png_pass_inc[pass]; j++)
2001 {
2002 *dp-- = *sptr;
2003 }
2004 --sptr;
2005 }
2006 }
2007 else if (width) /* && ((pass == 4) || (pass == 5)) */
2008 {
2009 int width_mmx = ((width >> 3) << 3);
2010 width -= width_mmx; // 0-7 pixels => 0-7 bytes
2011 if (width_mmx)
2012 {
2013 int dummy_value_c; // fix 'forbidden register spilled'
2014 int dummy_value_S;
2015 int dummy_value_D;
2017 __asm__ __volatile__ (
2018 "subl $7, %%esi \n\t"
2019 "subl $15, %%edi \n\t"
2021 ".loop1_pass4: \n\t"
2022 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2023 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2024 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2025 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2026 "movq %%mm1, 8(%%edi) \n\t"
2027 "subl $8, %%esi \n\t"
2028 "movq %%mm0, (%%edi) \n\t"
2029 "subl $16, %%edi \n\t"
2030 "subl $8, %%ecx \n\t"
2031 "jnz .loop1_pass4 \n\t"
2032 "EMMS \n\t" // DONE
2034 : "=c" (dummy_value_c), // output regs (none)
2035 "=S" (dummy_value_S),
2036 "=D" (dummy_value_D)
2038 : "1" (sptr), // esi // input regs
2039 "2" (dp), // edi
2040 "0" (width_mmx) // ecx
2042 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2043 : "%mm0", "%mm1" // clobber list
2044 #endif
2045 );
2046 }
2048 sptr -= width_mmx;
2049 dp -= width_mmx*2;
2050 for (i = width; i; i--)
2051 {
2052 int j;
2054 for (j = 0; j < png_pass_inc[pass]; j++)
2055 {
2056 *dp-- = *sptr;
2057 }
2058 --sptr;
2059 }
2060 }
2061 } /* end of pixel_bytes == 1 */
2063 //--------------------------------------------------------------
2064 else if (pixel_bytes == 2)
2065 {
2066 if (((pass == 0) || (pass == 1)) && width)
2067 {
2068 int width_mmx = ((width >> 1) << 1);
2069 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2070 if (width_mmx)
2071 {
2072 int dummy_value_c; // fix 'forbidden register spilled'
2073 int dummy_value_S;
2074 int dummy_value_D;
2076 __asm__ __volatile__ (
2077 "subl $2, %%esi \n\t"
2078 "subl $30, %%edi \n\t"
2080 ".loop2_pass0: \n\t"
2081 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2082 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2083 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2084 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2085 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2086 "movq %%mm0, (%%edi) \n\t"
2087 "movq %%mm0, 8(%%edi) \n\t"
2088 "movq %%mm1, 16(%%edi) \n\t"
2089 "subl $4, %%esi \n\t"
2090 "movq %%mm1, 24(%%edi) \n\t"
2091 "subl $32, %%edi \n\t"
2092 "subl $2, %%ecx \n\t"
2093 "jnz .loop2_pass0 \n\t"
2094 "EMMS \n\t" // DONE
2096 : "=c" (dummy_value_c), // output regs (dummy)
2097 "=S" (dummy_value_S),
2098 "=D" (dummy_value_D)
2100 : "1" (sptr), // esi // input regs
2101 "2" (dp), // edi
2102 "0" (width_mmx) // ecx
2104 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2105 : "%mm0", "%mm1" // clobber list
2106 #endif
2107 );
2108 }
2110 sptr -= (width_mmx*2 - 2); // sign fixed
2111 dp -= (width_mmx*16 - 2); // sign fixed
2112 for (i = width; i; i--)
2113 {
2114 png_byte v[8];
2115 int j;
2116 sptr -= 2;
2117 png_memcpy(v, sptr, 2);
2118 for (j = 0; j < png_pass_inc[pass]; j++)
2119 {
2120 dp -= 2;
2121 png_memcpy(dp, v, 2);
2122 }
2123 }
2124 }
2125 else if (((pass == 2) || (pass == 3)) && width)
2126 {
2127 int width_mmx = ((width >> 1) << 1) ;
2128 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2129 if (width_mmx)
2130 {
2131 int dummy_value_c; // fix 'forbidden register spilled'
2132 int dummy_value_S;
2133 int dummy_value_D;
2135 __asm__ __volatile__ (
2136 "subl $2, %%esi \n\t"
2137 "subl $14, %%edi \n\t"
2139 ".loop2_pass2: \n\t"
2140 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2141 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2142 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2143 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2144 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2145 "movq %%mm0, (%%edi) \n\t"
2146 "subl $4, %%esi \n\t"
2147 "movq %%mm1, 8(%%edi) \n\t"
2148 "subl $16, %%edi \n\t"
2149 "subl $2, %%ecx \n\t"
2150 "jnz .loop2_pass2 \n\t"
2151 "EMMS \n\t" // DONE
2153 : "=c" (dummy_value_c), // output regs (dummy)
2154 "=S" (dummy_value_S),
2155 "=D" (dummy_value_D)
2157 : "1" (sptr), // esi // input regs
2158 "2" (dp), // edi
2159 "0" (width_mmx) // ecx
2161 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2162 : "%mm0", "%mm1" // clobber list
2163 #endif
2164 );
2165 }
2167 sptr -= (width_mmx*2 - 2); // sign fixed
2168 dp -= (width_mmx*8 - 2); // sign fixed
2169 for (i = width; i; i--)
2170 {
2171 png_byte v[8];
2172 int j;
2173 sptr -= 2;
2174 png_memcpy(v, sptr, 2);
2175 for (j = 0; j < png_pass_inc[pass]; j++)
2176 {
2177 dp -= 2;
2178 png_memcpy(dp, v, 2);
2179 }
2180 }
2181 }
2182 else if (width) // pass == 4 or 5
2183 {
2184 int width_mmx = ((width >> 1) << 1) ;
2185 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2186 if (width_mmx)
2187 {
2188 int dummy_value_c; // fix 'forbidden register spilled'
2189 int dummy_value_S;
2190 int dummy_value_D;
2192 __asm__ __volatile__ (
2193 "subl $2, %%esi \n\t"
2194 "subl $6, %%edi \n\t"
2196 ".loop2_pass4: \n\t"
2197 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2198 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2199 "subl $4, %%esi \n\t"
2200 "movq %%mm0, (%%edi) \n\t"
2201 "subl $8, %%edi \n\t"
2202 "subl $2, %%ecx \n\t"
2203 "jnz .loop2_pass4 \n\t"
2204 "EMMS \n\t" // DONE
2206 : "=c" (dummy_value_c), // output regs (dummy)
2207 "=S" (dummy_value_S),
2208 "=D" (dummy_value_D)
2210 : "1" (sptr), // esi // input regs
2211 "2" (dp), // edi
2212 "0" (width_mmx) // ecx
2214 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2215 : "%mm0" // clobber list
2216 #endif
2217 );
2218 }
2220 sptr -= (width_mmx*2 - 2); // sign fixed
2221 dp -= (width_mmx*4 - 2); // sign fixed
2222 for (i = width; i; i--)
2223 {
2224 png_byte v[8];
2225 int j;
2226 sptr -= 2;
2227 png_memcpy(v, sptr, 2);
2228 for (j = 0; j < png_pass_inc[pass]; j++)
2229 {
2230 dp -= 2;
2231 png_memcpy(dp, v, 2);
2232 }
2233 }
2234 }
2235 } /* end of pixel_bytes == 2 */
2237 //--------------------------------------------------------------
2238 else if (pixel_bytes == 4)
2239 {
2240 if (((pass == 0) || (pass == 1)) && width)
2241 {
2242 int width_mmx = ((width >> 1) << 1);
2243 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2244 if (width_mmx)
2245 {
2246 int dummy_value_c; // fix 'forbidden register spilled'
2247 int dummy_value_S;
2248 int dummy_value_D;
2250 __asm__ __volatile__ (
2251 "subl $4, %%esi \n\t"
2252 "subl $60, %%edi \n\t"
2254 ".loop4_pass0: \n\t"
2255 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2256 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2257 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2258 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2259 "movq %%mm0, (%%edi) \n\t"
2260 "movq %%mm0, 8(%%edi) \n\t"
2261 "movq %%mm0, 16(%%edi) \n\t"
2262 "movq %%mm0, 24(%%edi) \n\t"
2263 "movq %%mm1, 32(%%edi) \n\t"
2264 "movq %%mm1, 40(%%edi) \n\t"
2265 "movq %%mm1, 48(%%edi) \n\t"
2266 "subl $8, %%esi \n\t"
2267 "movq %%mm1, 56(%%edi) \n\t"
2268 "subl $64, %%edi \n\t"
2269 "subl $2, %%ecx \n\t"
2270 "jnz .loop4_pass0 \n\t"
2271 "EMMS \n\t" // DONE
2273 : "=c" (dummy_value_c), // output regs (dummy)
2274 "=S" (dummy_value_S),
2275 "=D" (dummy_value_D)
2277 : "1" (sptr), // esi // input regs
2278 "2" (dp), // edi
2279 "0" (width_mmx) // ecx
2281 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2282 : "%mm0", "%mm1" // clobber list
2283 #endif
2284 );
2285 }
2287 sptr -= (width_mmx*4 - 4); // sign fixed
2288 dp -= (width_mmx*32 - 4); // sign fixed
2289 for (i = width; i; i--)
2290 {
2291 png_byte v[8];
2292 int j;
2293 sptr -= 4;
2294 png_memcpy(v, sptr, 4);
2295 for (j = 0; j < png_pass_inc[pass]; j++)
2296 {
2297 dp -= 4;
2298 png_memcpy(dp, v, 4);
2299 }
2300 }
2301 }
2302 else if (((pass == 2) || (pass == 3)) && width)
2303 {
2304 int width_mmx = ((width >> 1) << 1);
2305 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2306 if (width_mmx)
2307 {
2308 int dummy_value_c; // fix 'forbidden register spilled'
2309 int dummy_value_S;
2310 int dummy_value_D;
2312 __asm__ __volatile__ (
2313 "subl $4, %%esi \n\t"
2314 "subl $28, %%edi \n\t"
2316 ".loop4_pass2: \n\t"
2317 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2318 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2319 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2320 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2321 "movq %%mm0, (%%edi) \n\t"
2322 "movq %%mm0, 8(%%edi) \n\t"
2323 "movq %%mm1, 16(%%edi) \n\t"
2324 "movq %%mm1, 24(%%edi) \n\t"
2325 "subl $8, %%esi \n\t"
2326 "subl $32, %%edi \n\t"
2327 "subl $2, %%ecx \n\t"
2328 "jnz .loop4_pass2 \n\t"
2329 "EMMS \n\t" // DONE
2331 : "=c" (dummy_value_c), // output regs (dummy)
2332 "=S" (dummy_value_S),
2333 "=D" (dummy_value_D)
2335 : "1" (sptr), // esi // input regs
2336 "2" (dp), // edi
2337 "0" (width_mmx) // ecx
2339 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2340 : "%mm0", "%mm1" // clobber list
2341 #endif
2342 );
2343 }
2345 sptr -= (width_mmx*4 - 4); // sign fixed
2346 dp -= (width_mmx*16 - 4); // sign fixed
2347 for (i = width; i; i--)
2348 {
2349 png_byte v[8];
2350 int j;
2351 sptr -= 4;
2352 png_memcpy(v, sptr, 4);
2353 for (j = 0; j < png_pass_inc[pass]; j++)
2354 {
2355 dp -= 4;
2356 png_memcpy(dp, v, 4);
2357 }
2358 }
2359 }
2360 else if (width) // pass == 4 or 5
2361 {
2362 int width_mmx = ((width >> 1) << 1) ;
2363 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2364 if (width_mmx)
2365 {
2366 int dummy_value_c; // fix 'forbidden register spilled'
2367 int dummy_value_S;
2368 int dummy_value_D;
2370 __asm__ __volatile__ (
2371 "subl $4, %%esi \n\t"
2372 "subl $12, %%edi \n\t"
2374 ".loop4_pass4: \n\t"
2375 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2376 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2377 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2378 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2379 "movq %%mm0, (%%edi) \n\t"
2380 "subl $8, %%esi \n\t"
2381 "movq %%mm1, 8(%%edi) \n\t"
2382 "subl $16, %%edi \n\t"
2383 "subl $2, %%ecx \n\t"
2384 "jnz .loop4_pass4 \n\t"
2385 "EMMS \n\t" // DONE
2387 : "=c" (dummy_value_c), // output regs (dummy)
2388 "=S" (dummy_value_S),
2389 "=D" (dummy_value_D)
2391 : "1" (sptr), // esi // input regs
2392 "2" (dp), // edi
2393 "0" (width_mmx) // ecx
2395 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2396 : "%mm0", "%mm1" // clobber list
2397 #endif
2398 );
2399 }
2401 sptr -= (width_mmx*4 - 4); // sign fixed
2402 dp -= (width_mmx*8 - 4); // sign fixed
2403 for (i = width; i; i--)
2404 {
2405 png_byte v[8];
2406 int j;
2407 sptr -= 4;
2408 png_memcpy(v, sptr, 4);
2409 for (j = 0; j < png_pass_inc[pass]; j++)
2410 {
2411 dp -= 4;
2412 png_memcpy(dp, v, 4);
2413 }
2414 }
2415 }
2416 } /* end of pixel_bytes == 4 */
2418 //--------------------------------------------------------------
2419 else if (pixel_bytes == 8)
2420 {
2421 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2422 // GRR NOTE: no need to combine passes here!
2423 if (((pass == 0) || (pass == 1)) && width)
2424 {
2425 int dummy_value_c; // fix 'forbidden register spilled'
2426 int dummy_value_S;
2427 int dummy_value_D;
2429 // source is 8-byte RRGGBBAA
2430 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2431 __asm__ __volatile__ (
2432 "subl $56, %%edi \n\t" // start of last block
2434 ".loop8_pass0: \n\t"
2435 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2436 "movq %%mm0, (%%edi) \n\t"
2437 "movq %%mm0, 8(%%edi) \n\t"
2438 "movq %%mm0, 16(%%edi) \n\t"
2439 "movq %%mm0, 24(%%edi) \n\t"
2440 "movq %%mm0, 32(%%edi) \n\t"
2441 "movq %%mm0, 40(%%edi) \n\t"
2442 "movq %%mm0, 48(%%edi) \n\t"
2443 "subl $8, %%esi \n\t"
2444 "movq %%mm0, 56(%%edi) \n\t"
2445 "subl $64, %%edi \n\t"
2446 "decl %%ecx \n\t"
2447 "jnz .loop8_pass0 \n\t"
2448 "EMMS \n\t" // DONE
2450 : "=c" (dummy_value_c), // output regs (dummy)
2451 "=S" (dummy_value_S),
2452 "=D" (dummy_value_D)
2454 : "1" (sptr), // esi // input regs
2455 "2" (dp), // edi
2456 "0" (width) // ecx
2458 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2459 : "%mm0" // clobber list
2460 #endif
2461 );
2462 }
2463 else if (((pass == 2) || (pass == 3)) && width)
2464 {
2465 // source is 8-byte RRGGBBAA
2466 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2467 int width_mmx = ((width >> 1) << 1) ;
2468 width -= width_mmx;
2469 if (width_mmx)
2470 {
2471 int dummy_value_c; // fix 'forbidden register spilled'
2472 int dummy_value_S;
2473 int dummy_value_D;
2475 __asm__ __volatile__ (
2476 "subl $24, %%edi \n\t" // start of last block
2478 ".loop8_pass2: \n\t"
2479 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2480 "movq %%mm0, (%%edi) \n\t"
2481 "movq %%mm0, 8(%%edi) \n\t"
2482 "movq %%mm0, 16(%%edi) \n\t"
2483 "subl $8, %%esi \n\t"
2484 "movq %%mm0, 24(%%edi) \n\t"
2485 "subl $32, %%edi \n\t"
2486 "decl %%ecx \n\t"
2487 "jnz .loop8_pass2 \n\t"
2488 "EMMS \n\t" // DONE
2490 : "=c" (dummy_value_c), // output regs (dummy)
2491 "=S" (dummy_value_S),
2492 "=D" (dummy_value_D)
2494 : "1" (sptr), // esi // input regs
2495 "2" (dp), // edi
2496 "0" (width) // ecx
2498 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2499 : "%mm0" // clobber list
2500 #endif
2501 );
2502 }
2503 }
2504 else if (width) // pass == 4 or 5
2505 {
2506 // source is 8-byte RRGGBBAA
2507 // dest is 16-byte RRGGBBAA RRGGBBAA
2508 int width_mmx = ((width >> 1) << 1) ;
2509 width -= width_mmx;
2510 if (width_mmx)
2511 {
2512 int dummy_value_c; // fix 'forbidden register spilled'
2513 int dummy_value_S;
2514 int dummy_value_D;
2516 __asm__ __volatile__ (
2517 "subl $8, %%edi \n\t" // start of last block
2519 ".loop8_pass4: \n\t"
2520 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2521 "movq %%mm0, (%%edi) \n\t"
2522 "subl $8, %%esi \n\t"
2523 "movq %%mm0, 8(%%edi) \n\t"
2524 "subl $16, %%edi \n\t"
2525 "decl %%ecx \n\t"
2526 "jnz .loop8_pass4 \n\t"
2527 "EMMS \n\t" // DONE
2529 : "=c" (dummy_value_c), // output regs (dummy)
2530 "=S" (dummy_value_S),
2531 "=D" (dummy_value_D)
2533 : "1" (sptr), // esi // input regs
2534 "2" (dp), // edi
2535 "0" (width) // ecx
2537 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2538 : "%mm0" // clobber list
2539 #endif
2540 );
2541 }
2542 }
2544 } /* end of pixel_bytes == 8 */
2546 //--------------------------------------------------------------
2547 else if (pixel_bytes == 6)
2548 {
2549 for (i = width; i; i--)
2550 {
2551 png_byte v[8];
2552 int j;
2553 png_memcpy(v, sptr, 6);
2554 for (j = 0; j < png_pass_inc[pass]; j++)
2555 {
2556 png_memcpy(dp, v, 6);
2557 dp -= 6;
2558 }
2559 sptr -= 6;
2560 }
2561 } /* end of pixel_bytes == 6 */
2563 //--------------------------------------------------------------
2564 else
2565 {
2566 for (i = width; i; i--)
2567 {
2568 png_byte v[8];
2569 int j;
2570 png_memcpy(v, sptr, pixel_bytes);
2571 for (j = 0; j < png_pass_inc[pass]; j++)
2572 {
2573 png_memcpy(dp, v, pixel_bytes);
2574 dp -= pixel_bytes;
2575 }
2576 sptr-= pixel_bytes;
2577 }
2578 }
2579 } // end of _mmx_supported ========================================
2581 else /* MMX not supported: use modified C code - takes advantage
2582 * of inlining of png_memcpy for a constant */
2583 /* GRR 19991007: does it? or should pixel_bytes in each
2584 * block be replaced with immediate value (e.g., 1)? */
2585 /* GRR 19991017: replaced with constants in each case */
2586 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2587 {
2588 if (pixel_bytes == 1)
2589 {
2590 for (i = width; i; i--)
2591 {
2592 int j;
2593 for (j = 0; j < png_pass_inc[pass]; j++)
2594 {
2595 *dp-- = *sptr;
2596 }
2597 --sptr;
2598 }
2599 }
2600 else if (pixel_bytes == 3)
2601 {
2602 for (i = width; i; i--)
2603 {
2604 png_byte v[8];
2605 int j;
2606 png_memcpy(v, sptr, 3);
2607 for (j = 0; j < png_pass_inc[pass]; j++)
2608 {
2609 png_memcpy(dp, v, 3);
2610 dp -= 3;
2611 }
2612 sptr -= 3;
2613 }
2614 }
2615 else if (pixel_bytes == 2)
2616 {
2617 for (i = width; i; i--)
2618 {
2619 png_byte v[8];
2620 int j;
2621 png_memcpy(v, sptr, 2);
2622 for (j = 0; j < png_pass_inc[pass]; j++)
2623 {
2624 png_memcpy(dp, v, 2);
2625 dp -= 2;
2626 }
2627 sptr -= 2;
2628 }
2629 }
2630 else if (pixel_bytes == 4)
2631 {
2632 for (i = width; i; i--)
2633 {
2634 png_byte v[8];
2635 int j;
2636 png_memcpy(v, sptr, 4);
2637 for (j = 0; j < png_pass_inc[pass]; j++)
2638 {
2639 #ifdef PNG_DEBUG
2640 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2641 {
2642 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2643 row, dp, row+png_ptr->row_buf_size);
2644 printf("row_buf=%d\n",png_ptr->row_buf_size);
2645 }
2646 #endif
2647 png_memcpy(dp, v, 4);
2648 dp -= 4;
2649 }
2650 sptr -= 4;
2651 }
2652 }
2653 else if (pixel_bytes == 6)
2654 {
2655 for (i = width; i; i--)
2656 {
2657 png_byte v[8];
2658 int j;
2659 png_memcpy(v, sptr, 6);
2660 for (j = 0; j < png_pass_inc[pass]; j++)
2661 {
2662 png_memcpy(dp, v, 6);
2663 dp -= 6;
2664 }
2665 sptr -= 6;
2666 }
2667 }
2668 else if (pixel_bytes == 8)
2669 {
2670 for (i = width; i; i--)
2671 {
2672 png_byte v[8];
2673 int j;
2674 png_memcpy(v, sptr, 8);
2675 for (j = 0; j < png_pass_inc[pass]; j++)
2676 {
2677 png_memcpy(dp, v, 8);
2678 dp -= 8;
2679 }
2680 sptr -= 8;
2681 }
2682 }
2683 else /* GRR: should never be reached */
2684 {
2685 for (i = width; i; i--)
2686 {
2687 png_byte v[8];
2688 int j;
2689 png_memcpy(v, sptr, pixel_bytes);
2690 for (j = 0; j < png_pass_inc[pass]; j++)
2691 {
2692 png_memcpy(dp, v, pixel_bytes);
2693 dp -= pixel_bytes;
2694 }
2695 sptr -= pixel_bytes;
2696 }
2697 }
2699 } /* end if (MMX not supported) */
2700 break;
2701 }
2702 } /* end switch (row_info->pixel_depth) */
2704 row_info->width = final_width;
2705 row_info->rowbytes = ((final_width *
2706 (png_uint_32)row_info->pixel_depth + 7) >> 3);
2707 }
2709 } /* end png_do_read_interlace() */
2711 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2712 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2716 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2717 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
// 64-bit operands shared with the MMX filter routines that follow.  The
// inline assembly refers to these objects directly by symbol name (e.g.
// "movq _LBCarryMask, %%mm5"), so they are kept at file scope with external
// linkage.  Pairing the integer payload with a double in a union is intended
// to place every object on an 8-byte boundary.

union uAll {
    long long use;    // the 64-bit value the C code stores and the asm loads
    double align;     // alignment companion for 'use' (see note above)
};

// 0x01 in every byte lane: ANDed with a row to pick out each byte's low bit
// (the "LBCarry" term of the per-byte average).
union uAll _LBCarryMask = {0x0101010101010101LL};

// 0x7f in every byte lane: ANDed after a 64-bit "psrlq $1" to drop the bit
// that was shifted in from the neighbouring byte.
union uAll _HBClearMask = {0x7f7f7f7f7f7f7f7fLL};

// Selects the first "active group" of bpp bytes; assigned per pixel size in
// the switch (bpp) of the Avg filter before each MMX loop.
union uAll _ActiveMask;

// NOTE(review): not referenced by the Avg code visible here; presumably
// consumed by the other filter routines later in the file -- confirm.
union uAll _ActiveMask2;
union uAll _ActiveMaskEnd;

// Shift counts handed to psllq/psrlq: _ShiftBpp is bpp * 8 bits and
// _ShiftRem is 64 - _ShiftBpp, as set up by the Avg filter.
union uAll _ShiftBpp;
union uAll _ShiftRem;
2729 #ifdef PNG_THREAD_UNSAFE_OK
2730 //===========================================================================//
2731 // //
2732 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2733 // //
2734 //===========================================================================//
2736 // Optimized code for PNG Average filter decoder
2738 static void /* PRIVATE */
2739 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2740 png_bytep prev_row)
2741 {
2742 int bpp;
2743 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2744 int dummy_value_S;
2745 int dummy_value_D;
2747 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2748 _FullLength = row_info->rowbytes; // # of bytes to filter
2750 __asm__ __volatile__ (
2751 // initialize address pointers and offset
2752 #ifdef __PIC__
2753 "pushl %%ebx \n\t" // save index to Global Offset Table
2754 #endif
2755 //pre "movl row, %%edi \n\t" // edi: Avg(x)
2756 "xorl %%ebx, %%ebx \n\t" // ebx: x
2757 "movl %%edi, %%edx \n\t"
2758 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2759 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2760 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2762 "xorl %%eax,%%eax \n\t"
2764 // Compute the Raw value for the first bpp bytes
2765 // Raw(x) = Avg(x) + (Prior(x)/2)
2766 "avg_rlp: \n\t"
2767 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2768 "incl %%ebx \n\t"
2769 "shrb %%al \n\t" // divide by 2
2770 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2771 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2772 "cmpl %%ecx, %%ebx \n\t"
2773 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2774 "jb avg_rlp \n\t" // mov does not affect flags
2776 // get # of bytes to alignment
2777 "movl %%edi, _dif \n\t" // take start of row
2778 "addl %%ebx, _dif \n\t" // add bpp
2779 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2780 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2781 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2782 "jz avg_go \n\t" // alignment
2784 // fix alignment
2785 // Compute the Raw value for the bytes up to the alignment boundary
2786 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2787 "xorl %%ecx, %%ecx \n\t"
2789 "avg_lp1: \n\t"
2790 "xorl %%eax, %%eax \n\t"
2791 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2792 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2793 "addw %%cx, %%ax \n\t"
2794 "incl %%ebx \n\t"
2795 "shrw %%ax \n\t" // divide by 2
2796 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2797 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2798 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2799 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2801 "avg_go: \n\t"
2802 "movl _FullLength, %%eax \n\t"
2803 "movl %%eax, %%ecx \n\t"
2804 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2805 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2806 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2807 "movl %%ecx, _MMXLength \n\t"
2808 #ifdef __PIC__
2809 "popl %%ebx \n\t" // restore index to Global Offset Table
2810 #endif
2812 : "=c" (dummy_value_c), // output regs (dummy)
2813 "=S" (dummy_value_S),
2814 "=D" (dummy_value_D)
2816 : "0" (bpp), // ecx // input regs
2817 "1" (prev_row), // esi
2818 "2" (row) // edi
2820 : "%eax", "%edx" // clobber list
2821 #ifndef __PIC__
2822 , "%ebx"
2823 #endif
2824 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2825 // (seems to work fine without...)
2826 );
2828 // now do the math for the rest of the row
2829 switch (bpp)
2830 {
2831 case 3:
2832 {
2833 _ActiveMask.use = 0x0000000000ffffffLL;
2834 _ShiftBpp.use = 24; // == 3 * 8
2835 _ShiftRem.use = 40; // == 64 - 24
2837 __asm__ __volatile__ (
2838 // re-init address pointers and offset
2839 "movq _ActiveMask, %%mm7 \n\t"
2840 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2841 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2842 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2843 "movq _HBClearMask, %%mm4 \n\t"
2844 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2846 // prime the pump: load the first Raw(x-bpp) data set
2847 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2848 // (correct pos. in loop below)
2849 "avg_3lp: \n\t"
2850 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2851 "movq %%mm5, %%mm3 \n\t"
2852 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2853 // data
2854 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2855 "movq %%mm7, %%mm6 \n\t"
2856 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2857 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2858 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2859 // byte
2860 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2861 // each byte
2862 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2863 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2864 // LBCarrys
2865 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2866 // where both
2867 // lsb's were == 1 (only valid for active group)
2868 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2869 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2870 // byte
2871 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2872 // for each byte
2873 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2874 // bytes to add to Avg
2875 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2876 // Avg for each Active
2877 // byte
2878 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2879 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2880 // bytes 3-5
2881 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2882 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2883 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2884 // LBCarrys
2885 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2886 // where both
2887 // lsb's were == 1 (only valid for active group)
2888 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2889 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2890 // byte
2891 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2892 // for each byte
2893 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2894 // bytes to add to Avg
2895 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2896 // Avg for each Active
2897 // byte
2899 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2900 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2901 // two
2902 // bytes
2903 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2904 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2905 // Data only needs to be shifted once here to
2906 // get the correct x-bpp offset.
2907 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2908 // LBCarrys
2909 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2910 // where both
2911 // lsb's were == 1 (only valid for active group)
2912 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2913 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2914 // byte
2915 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2916 // for each byte
2917 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2918 // bytes to add to Avg
2919 "addl $8, %%ecx \n\t"
2920 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2921 // Avg for each Active
2922 // byte
2923 // now ready to write back to memory
2924 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2925 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2926 "cmpl _MMXLength, %%ecx \n\t"
2927 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2928 "jb avg_3lp \n\t"
2930 : "=S" (dummy_value_S), // output regs (dummy)
2931 "=D" (dummy_value_D)
2933 : "0" (prev_row), // esi // input regs
2934 "1" (row) // edi
2936 : "%ecx" // clobber list
2937 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2938 , "%mm0", "%mm1", "%mm2", "%mm3"
2939 , "%mm4", "%mm5", "%mm6", "%mm7"
2940 #endif
2941 );
2942 }
2943 break; // end 3 bpp
2945 case 6:
2946 case 4:
2947 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2948 //case 5: // GRR BOGUS
2949 {
2950 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2951 // appropriate inactive bytes
2952 _ShiftBpp.use = bpp << 3;
2953 _ShiftRem.use = 64 - _ShiftBpp.use;
2955 __asm__ __volatile__ (
2956 "movq _HBClearMask, %%mm4 \n\t"
2958 // re-init address pointers and offset
2959 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2960 // alignment boundary
2962 // load _ActiveMask and clear all bytes except for 1st active group
2963 "movq _ActiveMask, %%mm7 \n\t"
2964 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2965 "psrlq _ShiftRem, %%mm7 \n\t"
2966 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2967 "movq %%mm7, %%mm6 \n\t"
2968 "movq _LBCarryMask, %%mm5 \n\t"
2969 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
2970 // group
2972 // prime the pump: load the first Raw(x-bpp) data set
2973 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2974 // (we correct pos. in loop below)
2975 "avg_4lp: \n\t"
2976 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2977 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2978 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2979 // add (Prev_row/2) to average
2980 "movq %%mm5, %%mm3 \n\t"
2981 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2982 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2983 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2984 // byte
2985 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2986 // each byte
2987 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2988 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2989 // LBCarrys
2990 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2991 // where both
2992 // lsb's were == 1 (only valid for active group)
2993 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2994 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2995 // byte
2996 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2997 // for each byte
2998 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
2999 // bytes to add to Avg
3000 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3001 // for each Active
3002 // byte
3003 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3004 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3005 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3006 "addl $8, %%ecx \n\t"
3007 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3008 // LBCarrys
3009 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3010 // where both
3011 // lsb's were == 1 (only valid for active group)
3012 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3013 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3014 // byte
3015 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3016 // for each byte
3017 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3018 // bytes to add to Avg
3019 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3020 // Avg for each Active
3021 // byte
3022 "cmpl _MMXLength, %%ecx \n\t"
3023 // now ready to write back to memory
3024 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3025 // prep Raw(x-bpp) for next loop
3026 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3027 "jb avg_4lp \n\t"
3029 : "=S" (dummy_value_S), // output regs (dummy)
3030 "=D" (dummy_value_D)
3032 : "0" (prev_row), // esi // input regs
3033 "1" (row) // edi
3035 : "%ecx" // clobber list
3036 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3037 , "%mm0", "%mm1", "%mm2", "%mm3"
3038 , "%mm4", "%mm5", "%mm6", "%mm7"
3039 #endif
3040 );
3041 }
3042 break; // end 4,6 bpp
3044 case 2:
3045 {
3046 _ActiveMask.use = 0x000000000000ffffLL;
3047 _ShiftBpp.use = 16; // == 2 * 8
3048 _ShiftRem.use = 48; // == 64 - 16
3050 __asm__ __volatile__ (
3051 // load _ActiveMask
3052 "movq _ActiveMask, %%mm7 \n\t"
3053 // re-init address pointers and offset
3054 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3055 // boundary
3056 "movq _LBCarryMask, %%mm5 \n\t"
3057 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3058 "movq _HBClearMask, %%mm4 \n\t"
3059 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3061 // prime the pump: load the first Raw(x-bpp) data set
3062 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3063 // (we correct pos. in loop below)
3064 "avg_2lp: \n\t"
3065 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3066 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3067 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3068 // add (Prev_row/2) to average
3069 "movq %%mm5, %%mm3 \n\t"
3070 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3071 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3072 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3073 // byte
3074 "movq %%mm7, %%mm6 \n\t"
3075 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3076 // each byte
3078 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3079 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3080 // LBCarrys
3081 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3082 // where both
3083 // lsb's were == 1 (only valid
3084 // for active group)
3085 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3086 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3087 // byte
3088 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3089 // for each byte
3090 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3091 // bytes to add to Avg
3092 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3093 // for each Active byte
3095 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3096 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3097 // bytes 2 & 3
3098 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3099 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3100 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3101 // LBCarrys
3102 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3103 // where both
3104 // lsb's were == 1 (only valid
3105 // for active group)
3106 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3107 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3108 // byte
3109 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3110 // for each byte
3111 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3112 // bytes to add to Avg
3113 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3114 // Avg for each Active byte
3116 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3117 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3118 // bytes 4 & 5
3119 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3120 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3121 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3122 // LBCarrys
3123 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3124 // where both lsb's were == 1
3125 // (only valid for active group)
3126 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3127 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3128 // byte
3129 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3130 // for each byte
3131 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3132 // bytes to add to Avg
3133 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3134 // Avg for each Active byte
3136 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3137 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3138 // bytes 6 & 7
3139 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3140 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3141 "addl $8, %%ecx \n\t"
3142 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3143 // LBCarrys
3144 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3145 // where both
3146 // lsb's were == 1 (only valid
3147 // for active group)
3148 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3149 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3150 // byte
3151 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3152 // for each byte
3153 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3154 // bytes to add to Avg
3155 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3156 // Avg for each Active byte
3158 "cmpl _MMXLength, %%ecx \n\t"
3159 // now ready to write back to memory
3160 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3161 // prep Raw(x-bpp) for next loop
3162 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3163 "jb avg_2lp \n\t"
3165 : "=S" (dummy_value_S), // output regs (dummy)
3166 "=D" (dummy_value_D)
3168 : "0" (prev_row), // esi // input regs
3169 "1" (row) // edi
3171 : "%ecx" // clobber list
3172 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3173 , "%mm0", "%mm1", "%mm2", "%mm3"
3174 , "%mm4", "%mm5", "%mm6", "%mm7"
3175 #endif
3176 );
3177 }
3178 break; // end 2 bpp
3180 case 1:
3181 {
3182 __asm__ __volatile__ (
3183 // re-init address pointers and offset
3184 #ifdef __PIC__
3185 "pushl %%ebx \n\t" // save Global Offset Table index
3186 #endif
3187 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3188 // boundary
3189 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3190 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3191 "jnb avg_1end \n\t"
3192 // do Avg decode for remaining bytes
3193 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3194 "movl %%edi, %%edx \n\t"
3195 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3196 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3197 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3198 // in loop below
3199 "avg_1lp: \n\t"
3200 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3201 "xorl %%eax, %%eax \n\t"
3202 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3203 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3204 "addw %%cx, %%ax \n\t"
3205 "incl %%ebx \n\t"
3206 "shrw %%ax \n\t" // divide by 2
3207 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3208 // inc ebx
3209 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3210 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3211 // mov does not affect flags; -1 to offset inc ebx
3212 "jb avg_1lp \n\t"
3214 "avg_1end: \n\t"
3215 #ifdef __PIC__
3216 "popl %%ebx \n\t" // Global Offset Table index
3217 #endif
3219 : "=c" (dummy_value_c), // output regs (dummy)
3220 "=S" (dummy_value_S),
3221 "=D" (dummy_value_D)
3223 : "0" (bpp), // ecx // input regs
3224 "1" (prev_row), // esi
3225 "2" (row) // edi
3227 : "%eax", "%edx" // clobber list
3228 #ifndef __PIC__
3229 , "%ebx"
3230 #endif
3231 );
3232 }
3233 return; // end 1 bpp
3235 case 8:
3236 {
3237 __asm__ __volatile__ (
3238 // re-init address pointers and offset
3239 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3240 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3241 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3242 "movq _HBClearMask, %%mm4 \n\t"
3243 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3245 // prime the pump: load the first Raw(x-bpp) data set
3246 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3247 // (NO NEED to correct pos. in loop below)
3249 "avg_8lp: \n\t"
3250 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3251 "movq %%mm5, %%mm3 \n\t"
3252 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3253 "addl $8, %%ecx \n\t"
3254 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3255 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3256 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3257 // where both lsb's were == 1
3258 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3259 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3260 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3261 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3262 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3263 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3264 "cmpl _MMXLength, %%ecx \n\t"
3265 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3266 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3267 "jb avg_8lp \n\t"
3269 : "=S" (dummy_value_S), // output regs (dummy)
3270 "=D" (dummy_value_D)
3272 : "0" (prev_row), // esi // input regs
3273 "1" (row) // edi
3275 : "%ecx" // clobber list
3276 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3277 , "%mm0", "%mm1", "%mm2"
3278 , "%mm3", "%mm4", "%mm5"
3279 #endif
3280 );
3281 }
3282 break; // end 8 bpp
3284 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3285 {
3287 #ifdef PNG_DEBUG
3288 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3289 png_debug(1,
3290 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3291 #endif
3293 #if 0
3294 __asm__ __volatile__ (
3295 "movq _LBCarryMask, %%mm5 \n\t"
3296 // re-init address pointers and offset
3297 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3298 // alignment boundary
3299 "movl row, %%edi \n\t" // edi: Avg(x)
3300 "movq _HBClearMask, %%mm4 \n\t"
3301 "movl %%edi, %%edx \n\t"
3302 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3303 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3304 "avg_Alp: \n\t"
3305 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3306 "movq %%mm5, %%mm3 \n\t"
3307 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3308 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3309 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3310 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3311 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3312 // where both lsb's were == 1
3313 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3314 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3315 // byte
3316 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3317 // byte
3318 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3319 // byte
3320 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3321 // each byte
3322 "addl $8, %%ebx \n\t"
3323 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3324 // byte
3325 "cmpl _MMXLength, %%ebx \n\t"
3326 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3327 "jb avg_Alp \n\t"
3329 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3331 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3333 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3334 );
3335 #endif /* 0 - NEVER REACHED */
3336 }
3337 break;
3339 } // end switch (bpp)
3341 __asm__ __volatile__ (
3342 // MMX acceleration complete; now do clean-up
3343 // check if any remaining bytes left to decode
3344 #ifdef __PIC__
3345 "pushl %%ebx \n\t" // save index to Global Offset Table
3346 #endif
3347 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3348 //pre "movl row, %%edi \n\t" // edi: Avg(x)
3349 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3350 "jnb avg_end \n\t"
3352 // do Avg decode for remaining bytes
3353 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3354 "movl %%edi, %%edx \n\t"
3355 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3356 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3357 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3359 "avg_lp2: \n\t"
3360 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3361 "xorl %%eax, %%eax \n\t"
3362 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3363 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3364 "addw %%cx, %%ax \n\t"
3365 "incl %%ebx \n\t"
3366 "shrw %%ax \n\t" // divide by 2
3367 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3368 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3369 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3370 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3372 "avg_end: \n\t"
3373 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3374 #ifdef __PIC__
3375 "popl %%ebx \n\t" // restore index to Global Offset Table
3376 #endif
3378 : "=c" (dummy_value_c), // output regs (dummy)
3379 "=S" (dummy_value_S),
3380 "=D" (dummy_value_D)
3382 : "0" (bpp), // ecx // input regs
3383 "1" (prev_row), // esi
3384 "2" (row) // edi
3386 : "%eax", "%edx" // clobber list
3387 #ifndef __PIC__
3388 , "%ebx"
3389 #endif
3390 );
3392 } /* end png_read_filter_row_mmx_avg() */
3393 #endif
3397 #ifdef PNG_THREAD_UNSAFE_OK
3398 //===========================================================================//
3399 // //
3400 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3401 // //
3402 //===========================================================================//
3404 // Optimized code for PNG Paeth filter decoder
3406 static void /* PRIVATE */
3407 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3408 png_bytep prev_row)
3409 {
3410 int bpp;
3411 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3412 int dummy_value_S;
3413 int dummy_value_D;
3415 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3416 _FullLength = row_info->rowbytes; // # of bytes to filter
3418 __asm__ __volatile__ (
3419 #ifdef __PIC__
3420 "pushl %%ebx \n\t" // save index to Global Offset Table
3421 #endif
3422 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3423 //pre "movl row, %%edi \n\t"
3424 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3425 //pre "movl prev_row, %%esi \n\t"
3426 "xorl %%eax, %%eax \n\t"
3428 // Compute the Raw value for the first bpp bytes
3429 // Note: the formula works out to be always
3430 // Raw(x) = Paeth(x) + Prior(x) where x < bpp
3431 "paeth_rlp: \n\t"
3432 "movb (%%edi,%%ebx,), %%al \n\t"
3433 "addb (%%esi,%%ebx,), %%al \n\t"
3434 "incl %%ebx \n\t"
3435 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3436 "cmpl %%ecx, %%ebx \n\t"
3437 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3438 "jb paeth_rlp \n\t"
3439 // get # of bytes to alignment
3440 "movl %%edi, _dif \n\t" // take start of row
3441 "addl %%ebx, _dif \n\t" // add bpp
3442 "xorl %%ecx, %%ecx \n\t"
3443 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3444 // boundary
3445 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3446 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3447 // at alignment
3448 "jz paeth_go \n\t"
3449 // fix alignment
3451 "paeth_lp1: \n\t"
3452 "xorl %%eax, %%eax \n\t"
3453 // pav = p - a = (a + b - c) - a = b - c
3454 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3455 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3456 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3457 "movl %%eax, _patemp \n\t" // Save pav for later use
3458 "xorl %%eax, %%eax \n\t"
3459 // pbv = p - b = (a + b - c) - b = a - c
3460 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3461 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3462 "movl %%eax, %%ecx \n\t"
3463 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3464 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3465 // pc = abs(pcv)
3466 "testl $0x80000000, %%eax \n\t"
3467 "jz paeth_pca \n\t"
3468 "negl %%eax \n\t" // reverse sign of neg values
3470 "paeth_pca: \n\t"
3471 "movl %%eax, _pctemp \n\t" // save pc for later use
3472 // pb = abs(pbv)
3473 "testl $0x80000000, %%ecx \n\t"
3474 "jz paeth_pba \n\t"
3475 "negl %%ecx \n\t" // reverse sign of neg values
3477 "paeth_pba: \n\t"
3478 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3479 // pa = abs(pav)
3480 "movl _patemp, %%eax \n\t"
3481 "testl $0x80000000, %%eax \n\t"
3482 "jz paeth_paa \n\t"
3483 "negl %%eax \n\t" // reverse sign of neg values
3485 "paeth_paa: \n\t"
3486 "movl %%eax, _patemp \n\t" // save pa for later use
3487 // test if pa <= pb
3488 "cmpl %%ecx, %%eax \n\t"
3489 "jna paeth_abb \n\t"
3490 // pa > pb; now test if pb <= pc
3491 "cmpl _pctemp, %%ecx \n\t"
3492 "jna paeth_bbc \n\t"
3493 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3494 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3495 "jmp paeth_paeth \n\t"
3497 "paeth_bbc: \n\t"
3498 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3499 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3500 "jmp paeth_paeth \n\t"
3502 "paeth_abb: \n\t"
3503 // pa <= pb; now test if pa <= pc
3504 "cmpl _pctemp, %%eax \n\t"
3505 "jna paeth_abc \n\t"
3506 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3507 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3508 "jmp paeth_paeth \n\t"
3510 "paeth_abc: \n\t"
3511 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3512 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3514 "paeth_paeth: \n\t"
3515 "incl %%ebx \n\t"
3516 "incl %%edx \n\t"
3517 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3518 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3519 "cmpl _dif, %%ebx \n\t"
3520 "jb paeth_lp1 \n\t"
3522 "paeth_go: \n\t"
3523 "movl _FullLength, %%ecx \n\t"
3524 "movl %%ecx, %%eax \n\t"
3525 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3526 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3527 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3528 "movl %%ecx, _MMXLength \n\t"
3529 #ifdef __PIC__
3530 "popl %%ebx \n\t" // restore index to Global Offset Table
3531 #endif
3533 : "=c" (dummy_value_c), // output regs (dummy)
3534 "=S" (dummy_value_S),
3535 "=D" (dummy_value_D)
3537 : "0" (bpp), // ecx // input regs
3538 "1" (prev_row), // esi
3539 "2" (row) // edi
3541 : "%eax", "%edx" // clobber list
3542 #ifndef __PIC__
3543 , "%ebx"
3544 #endif
3545 );
3547 // now do the math for the rest of the row
3548 switch (bpp)
3549 {
3550 case 3:
3551 {
3552 _ActiveMask.use = 0x0000000000ffffffLL;
3553 _ActiveMaskEnd.use = 0xffff000000000000LL;
3554 _ShiftBpp.use = 24; // == bpp(3) * 8
3555 _ShiftRem.use = 40; // == 64 - 24
3557 __asm__ __volatile__ (
3558 "movl _dif, %%ecx \n\t"
3559 // preload "movl row, %%edi \n\t"
3560 // preload "movl prev_row, %%esi \n\t"
3561 "pxor %%mm0, %%mm0 \n\t"
3562 // prime the pump: load the first Raw(x-bpp) data set
3563 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3564 "paeth_3lp: \n\t"
3565 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3566 // 3 bytes
3567 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3568 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3569 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3570 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3571 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3572 // 3 bytes
3573 // pav = p - a = (a + b - c) - a = b - c
3574 "movq %%mm2, %%mm4 \n\t"
3575 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3576 // pbv = p - b = (a + b - c) - b = a - c
3577 "movq %%mm1, %%mm5 \n\t"
3578 "psubw %%mm3, %%mm4 \n\t"
3579 "pxor %%mm7, %%mm7 \n\t"
3580 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3581 "movq %%mm4, %%mm6 \n\t"
3582 "psubw %%mm3, %%mm5 \n\t"
3584 // pa = abs(p-a) = abs(pav)
3585 // pb = abs(p-b) = abs(pbv)
3586 // pc = abs(p-c) = abs(pcv)
3587 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3588 "paddw %%mm5, %%mm6 \n\t"
3589 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3590 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3591 "psubw %%mm0, %%mm4 \n\t"
3592 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3593 "psubw %%mm0, %%mm4 \n\t"
3594 "psubw %%mm7, %%mm5 \n\t"
3595 "pxor %%mm0, %%mm0 \n\t"
3596 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3597 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3598 "psubw %%mm7, %%mm5 \n\t"
3599 "psubw %%mm0, %%mm6 \n\t"
3600 // test pa <= pb
3601 "movq %%mm4, %%mm7 \n\t"
3602 "psubw %%mm0, %%mm6 \n\t"
3603 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3604 "movq %%mm7, %%mm0 \n\t"
3605 // use mm7 mask to merge pa & pb
3606 "pand %%mm7, %%mm5 \n\t"
3607 // use mm0 mask copy to merge a & b
3608 "pand %%mm0, %%mm2 \n\t"
3609 "pandn %%mm4, %%mm7 \n\t"
3610 "pandn %%mm1, %%mm0 \n\t"
3611 "paddw %%mm5, %%mm7 \n\t"
3612 "paddw %%mm2, %%mm0 \n\t"
3613 // test ((pa <= pb)? pa:pb) <= pc
3614 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3615 "pxor %%mm1, %%mm1 \n\t"
3616 "pand %%mm7, %%mm3 \n\t"
3617 "pandn %%mm0, %%mm7 \n\t"
3618 "paddw %%mm3, %%mm7 \n\t"
3619 "pxor %%mm0, %%mm0 \n\t"
3620 "packuswb %%mm1, %%mm7 \n\t"
3621 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3622 "pand _ActiveMask, %%mm7 \n\t"
3623 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3624 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3625 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3626 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3627 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3628 // Raw(x-bpp)
3629 // now do Paeth for 2nd set of bytes (3-5)
3630 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3631 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3632 "pxor %%mm7, %%mm7 \n\t"
3633 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3634 // pbv = p - b = (a + b - c) - b = a - c
3635 "movq %%mm1, %%mm5 \n\t"
3636 // pav = p - a = (a + b - c) - a = b - c
3637 "movq %%mm2, %%mm4 \n\t"
3638 "psubw %%mm3, %%mm5 \n\t"
3639 "psubw %%mm3, %%mm4 \n\t"
3640 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3641 // pav + pbv = pbv + pav
3642 "movq %%mm5, %%mm6 \n\t"
3643 "paddw %%mm4, %%mm6 \n\t"
3645 // pa = abs(p-a) = abs(pav)
3646 // pb = abs(p-b) = abs(pbv)
3647 // pc = abs(p-c) = abs(pcv)
3648 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3649 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3650 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3651 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3652 "psubw %%mm0, %%mm5 \n\t"
3653 "psubw %%mm7, %%mm4 \n\t"
3654 "psubw %%mm0, %%mm5 \n\t"
3655 "psubw %%mm7, %%mm4 \n\t"
3656 "pxor %%mm0, %%mm0 \n\t"
3657 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3658 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3659 "psubw %%mm0, %%mm6 \n\t"
3660 // test pa <= pb
3661 "movq %%mm4, %%mm7 \n\t"
3662 "psubw %%mm0, %%mm6 \n\t"
3663 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3664 "movq %%mm7, %%mm0 \n\t"
3665 // use mm7 mask to merge pa & pb
3666 "pand %%mm7, %%mm5 \n\t"
3667 // use mm0 mask copy to merge a & b
3668 "pand %%mm0, %%mm2 \n\t"
3669 "pandn %%mm4, %%mm7 \n\t"
3670 "pandn %%mm1, %%mm0 \n\t"
3671 "paddw %%mm5, %%mm7 \n\t"
3672 "paddw %%mm2, %%mm0 \n\t"
3673 // test ((pa <= pb)? pa:pb) <= pc
3674 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3675 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3676 "pand %%mm7, %%mm3 \n\t"
3677 "pandn %%mm0, %%mm7 \n\t"
3678 "pxor %%mm1, %%mm1 \n\t"
3679 "paddw %%mm3, %%mm7 \n\t"
3680 "pxor %%mm0, %%mm0 \n\t"
3681 "packuswb %%mm1, %%mm7 \n\t"
3682 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3683 "pand _ActiveMask, %%mm7 \n\t"
3684 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3685 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3686 // 3 bytes
3687 // pav = p - a = (a + b - c) - a = b - c
3688 "movq %%mm2, %%mm4 \n\t"
3689 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3690 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3691 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3692 "movq %%mm7, %%mm1 \n\t"
3693 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3694 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3695 // now mm1 will be used as Raw(x-bpp)
3696 // now do Paeth for 3rd, and final, set of bytes (6-7)
3697 "pxor %%mm7, %%mm7 \n\t"
3698 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3699 "psubw %%mm3, %%mm4 \n\t"
3700 // pbv = p - b = (a + b - c) - b = a - c
3701 "movq %%mm1, %%mm5 \n\t"
3702 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3703 "movq %%mm4, %%mm6 \n\t"
3704 "psubw %%mm3, %%mm5 \n\t"
3705 "pxor %%mm0, %%mm0 \n\t"
3706 "paddw %%mm5, %%mm6 \n\t"
3708 // pa = abs(p-a) = abs(pav)
3709 // pb = abs(p-b) = abs(pbv)
3710 // pc = abs(p-c) = abs(pcv)
3711 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3712 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3713 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3714 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3715 "psubw %%mm0, %%mm4 \n\t"
3716 "psubw %%mm7, %%mm5 \n\t"
3717 "psubw %%mm0, %%mm4 \n\t"
3718 "psubw %%mm7, %%mm5 \n\t"
3719 "pxor %%mm0, %%mm0 \n\t"
3720 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3721 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3722 "psubw %%mm0, %%mm6 \n\t"
3723 // test pa <= pb
3724 "movq %%mm4, %%mm7 \n\t"
3725 "psubw %%mm0, %%mm6 \n\t"
3726 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3727 "movq %%mm7, %%mm0 \n\t"
3728 // use mm0 mask copy to merge a & b
3729 "pand %%mm0, %%mm2 \n\t"
3730 // use mm7 mask to merge pa & pb
3731 "pand %%mm7, %%mm5 \n\t"
3732 "pandn %%mm1, %%mm0 \n\t"
3733 "pandn %%mm4, %%mm7 \n\t"
3734 "paddw %%mm2, %%mm0 \n\t"
3735 "paddw %%mm5, %%mm7 \n\t"
3736 // test ((pa <= pb)? pa:pb) <= pc
3737 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3738 "pand %%mm7, %%mm3 \n\t"
3739 "pandn %%mm0, %%mm7 \n\t"
3740 "paddw %%mm3, %%mm7 \n\t"
3741 "pxor %%mm1, %%mm1 \n\t"
3742 "packuswb %%mm7, %%mm1 \n\t"
3743 // step ecx to next set of 8 bytes and repeat loop til done
3744 "addl $8, %%ecx \n\t"
3745 "pand _ActiveMaskEnd, %%mm1 \n\t"
3746 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3747 // Raw(x)
3749 "cmpl _MMXLength, %%ecx \n\t"
3750 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3751 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3752 // mm1 will be used as Raw(x-bpp) next loop
3753 // mm3 is reloaded with Prior(x-bpp) at the top of the loop
3754 "jb paeth_3lp \n\t"
3756 : "=S" (dummy_value_S), // output regs (dummy)
3757 "=D" (dummy_value_D)
3759 : "0" (prev_row), // esi // input regs
3760 "1" (row) // edi
3762 : "%ecx" // clobber list
3763 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3764 , "%mm0", "%mm1", "%mm2", "%mm3"
3765 , "%mm4", "%mm5", "%mm6", "%mm7"
3766 #endif
3767 );
3768 }
3769 break; // end 3 bpp
3771 case 6:
3772 //case 7: // GRR BOGUS
3773 //case 5: // GRR BOGUS
3774 {
3775 _ActiveMask.use = 0x00000000ffffffffLL;
3776 _ActiveMask2.use = 0xffffffff00000000LL;
3777 _ShiftBpp.use = bpp << 3; // == bpp * 8
3778 _ShiftRem.use = 64 - _ShiftBpp.use;
3780 __asm__ __volatile__ (
3781 "movl _dif, %%ecx \n\t"
3782 // preload "movl row, %%edi \n\t"
3783 // preload "movl prev_row, %%esi \n\t"
3784 // prime the pump: load the first Raw(x-bpp) data set
3785 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3786 "pxor %%mm0, %%mm0 \n\t"
3788 "paeth_6lp: \n\t"
3789 // must shift to position Raw(x-bpp) data
3790 "psrlq _ShiftRem, %%mm1 \n\t"
3791 // do first set of 4 bytes
3792 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3793 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3794 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3795 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3796 // must shift to position Prior(x-bpp) data
3797 "psrlq _ShiftRem, %%mm3 \n\t"
3798 // pav = p - a = (a + b - c) - a = b - c
3799 "movq %%mm2, %%mm4 \n\t"
3800 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3801 // pbv = p - b = (a + b - c) - b = a - c
3802 "movq %%mm1, %%mm5 \n\t"
3803 "psubw %%mm3, %%mm4 \n\t"
3804 "pxor %%mm7, %%mm7 \n\t"
3805 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3806 "movq %%mm4, %%mm6 \n\t"
3807 "psubw %%mm3, %%mm5 \n\t"
3808 // pa = abs(p-a) = abs(pav)
3809 // pb = abs(p-b) = abs(pbv)
3810 // pc = abs(p-c) = abs(pcv)
3811 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3812 "paddw %%mm5, %%mm6 \n\t"
3813 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3814 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3815 "psubw %%mm0, %%mm4 \n\t"
3816 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3817 "psubw %%mm0, %%mm4 \n\t"
3818 "psubw %%mm7, %%mm5 \n\t"
3819 "pxor %%mm0, %%mm0 \n\t"
3820 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3821 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3822 "psubw %%mm7, %%mm5 \n\t"
3823 "psubw %%mm0, %%mm6 \n\t"
3824 // test pa <= pb
3825 "movq %%mm4, %%mm7 \n\t"
3826 "psubw %%mm0, %%mm6 \n\t"
3827 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3828 "movq %%mm7, %%mm0 \n\t"
3829 // use mm7 mask to merge pa & pb
3830 "pand %%mm7, %%mm5 \n\t"
3831 // use mm0 mask copy to merge a & b
3832 "pand %%mm0, %%mm2 \n\t"
3833 "pandn %%mm4, %%mm7 \n\t"
3834 "pandn %%mm1, %%mm0 \n\t"
3835 "paddw %%mm5, %%mm7 \n\t"
3836 "paddw %%mm2, %%mm0 \n\t"
3837 // test ((pa <= pb)? pa:pb) <= pc
3838 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3839 "pxor %%mm1, %%mm1 \n\t"
3840 "pand %%mm7, %%mm3 \n\t"
3841 "pandn %%mm0, %%mm7 \n\t"
3842 "paddw %%mm3, %%mm7 \n\t"
3843 "pxor %%mm0, %%mm0 \n\t"
3844 "packuswb %%mm1, %%mm7 \n\t"
3845 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3846 "pand _ActiveMask, %%mm7 \n\t"
3847 "psrlq _ShiftRem, %%mm3 \n\t"
3848 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3849 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3850 "movq %%mm2, %%mm6 \n\t"
3851 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3852 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3853 "psllq _ShiftBpp, %%mm6 \n\t"
3854 "movq %%mm7, %%mm5 \n\t"
3855 "psrlq _ShiftRem, %%mm1 \n\t"
3856 "por %%mm6, %%mm3 \n\t"
3857 "psllq _ShiftBpp, %%mm5 \n\t"
3858 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3859 "por %%mm5, %%mm1 \n\t"
3860 // do second set of 4 bytes
3861 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3862 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3863 // pav = p - a = (a + b - c) - a = b - c
3864 "movq %%mm2, %%mm4 \n\t"
3865 // pbv = p - b = (a + b - c) - b = a - c
3866 "movq %%mm1, %%mm5 \n\t"
3867 "psubw %%mm3, %%mm4 \n\t"
3868 "pxor %%mm7, %%mm7 \n\t"
3869 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3870 "movq %%mm4, %%mm6 \n\t"
3871 "psubw %%mm3, %%mm5 \n\t"
3872 // pa = abs(p-a) = abs(pav)
3873 // pb = abs(p-b) = abs(pbv)
3874 // pc = abs(p-c) = abs(pcv)
3875 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3876 "paddw %%mm5, %%mm6 \n\t"
3877 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3878 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3879 "psubw %%mm0, %%mm4 \n\t"
3880 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3881 "psubw %%mm0, %%mm4 \n\t"
3882 "psubw %%mm7, %%mm5 \n\t"
3883 "pxor %%mm0, %%mm0 \n\t"
3884 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3885 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3886 "psubw %%mm7, %%mm5 \n\t"
3887 "psubw %%mm0, %%mm6 \n\t"
3888 // test pa <= pb
3889 "movq %%mm4, %%mm7 \n\t"
3890 "psubw %%mm0, %%mm6 \n\t"
3891 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3892 "movq %%mm7, %%mm0 \n\t"
3893 // use mm7 mask to merge pa & pb
3894 "pand %%mm7, %%mm5 \n\t"
3895 // use mm0 mask copy to merge a & b
3896 "pand %%mm0, %%mm2 \n\t"
3897 "pandn %%mm4, %%mm7 \n\t"
3898 "pandn %%mm1, %%mm0 \n\t"
3899 "paddw %%mm5, %%mm7 \n\t"
3900 "paddw %%mm2, %%mm0 \n\t"
3901 // test ((pa <= pb)? pa:pb) <= pc
3902 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3903 "pxor %%mm1, %%mm1 \n\t"
3904 "pand %%mm7, %%mm3 \n\t"
3905 "pandn %%mm0, %%mm7 \n\t"
3906 "pxor %%mm1, %%mm1 \n\t"
3907 "paddw %%mm3, %%mm7 \n\t"
3908 "pxor %%mm0, %%mm0 \n\t"
3909 // step ecx to next set of 8 bytes and repeat loop til done
3910 "addl $8, %%ecx \n\t"
3911 "packuswb %%mm7, %%mm1 \n\t"
3912 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3913 "cmpl _MMXLength, %%ecx \n\t"
3914 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3915 // mm1 will be used as Raw(x-bpp) next loop
3916 "jb paeth_6lp \n\t"
3918 : "=S" (dummy_value_S), // output regs (dummy)
3919 "=D" (dummy_value_D)
3921 : "0" (prev_row), // esi // input regs
3922 "1" (row) // edi
3924 : "%ecx" // clobber list
3925 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3926 , "%mm0", "%mm1", "%mm2", "%mm3"
3927 , "%mm4", "%mm5", "%mm6", "%mm7"
3928 #endif
3929 );
3930 }
3931 break; // end 6 bpp
3933 case 4:
3934 {
3935 _ActiveMask.use = 0x00000000ffffffffLL;
3937 __asm__ __volatile__ (
3938 "movl _dif, %%ecx \n\t"
3939 // preload "movl row, %%edi \n\t"
3940 // preload "movl prev_row, %%esi \n\t"
3941 "pxor %%mm0, %%mm0 \n\t"
3942 // prime the pump: load the first Raw(x-bpp) data set
3943 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3944 // a=Raw(x-bpp) bytes
3945 "paeth_4lp: \n\t"
3946 // do first set of 4 bytes
3947 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3948 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3949 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3950 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3951 // pav = p - a = (a + b - c) - a = b - c
3952 "movq %%mm2, %%mm4 \n\t"
3953 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3954 // pbv = p - b = (a + b - c) - b = a - c
3955 "movq %%mm1, %%mm5 \n\t"
3956 "psubw %%mm3, %%mm4 \n\t"
3957 "pxor %%mm7, %%mm7 \n\t"
3958 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3959 "movq %%mm4, %%mm6 \n\t"
3960 "psubw %%mm3, %%mm5 \n\t"
3961 // pa = abs(p-a) = abs(pav)
3962 // pb = abs(p-b) = abs(pbv)
3963 // pc = abs(p-c) = abs(pcv)
3964 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3965 "paddw %%mm5, %%mm6 \n\t"
3966 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3967 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3968 "psubw %%mm0, %%mm4 \n\t"
3969 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3970 "psubw %%mm0, %%mm4 \n\t"
3971 "psubw %%mm7, %%mm5 \n\t"
3972 "pxor %%mm0, %%mm0 \n\t"
3973 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3974 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3975 "psubw %%mm7, %%mm5 \n\t"
3976 "psubw %%mm0, %%mm6 \n\t"
3977 // test pa <= pb
3978 "movq %%mm4, %%mm7 \n\t"
3979 "psubw %%mm0, %%mm6 \n\t"
3980 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3981 "movq %%mm7, %%mm0 \n\t"
3982 // use mm7 mask to merge pa & pb
3983 "pand %%mm7, %%mm5 \n\t"
3984 // use mm0 mask copy to merge a & b
3985 "pand %%mm0, %%mm2 \n\t"
3986 "pandn %%mm4, %%mm7 \n\t"
3987 "pandn %%mm1, %%mm0 \n\t"
3988 "paddw %%mm5, %%mm7 \n\t"
3989 "paddw %%mm2, %%mm0 \n\t"
3990 // test ((pa <= pb)? pa:pb) <= pc
3991 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3992 "pxor %%mm1, %%mm1 \n\t"
3993 "pand %%mm7, %%mm3 \n\t"
3994 "pandn %%mm0, %%mm7 \n\t"
3995 "paddw %%mm3, %%mm7 \n\t"
3996 "pxor %%mm0, %%mm0 \n\t"
3997 "packuswb %%mm1, %%mm7 \n\t"
3998 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3999 "pand _ActiveMask, %%mm7 \n\t"
4000 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4001 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4002 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4003 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4004 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4005 // do second set of 4 bytes
4006 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4007 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4008 // pav = p - a = (a + b - c) - a = b - c
4009 "movq %%mm2, %%mm4 \n\t"
4010 // pbv = p - b = (a + b - c) - b = a - c
4011 "movq %%mm1, %%mm5 \n\t"
4012 "psubw %%mm3, %%mm4 \n\t"
4013 "pxor %%mm7, %%mm7 \n\t"
4014 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4015 "movq %%mm4, %%mm6 \n\t"
4016 "psubw %%mm3, %%mm5 \n\t"
4017 // pa = abs(p-a) = abs(pav)
4018 // pb = abs(p-b) = abs(pbv)
4019 // pc = abs(p-c) = abs(pcv)
4020 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4021 "paddw %%mm5, %%mm6 \n\t"
4022 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4023 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4024 "psubw %%mm0, %%mm4 \n\t"
4025 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4026 "psubw %%mm0, %%mm4 \n\t"
4027 "psubw %%mm7, %%mm5 \n\t"
4028 "pxor %%mm0, %%mm0 \n\t"
4029 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4030 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4031 "psubw %%mm7, %%mm5 \n\t"
4032 "psubw %%mm0, %%mm6 \n\t"
4033 // test pa <= pb
4034 "movq %%mm4, %%mm7 \n\t"
4035 "psubw %%mm0, %%mm6 \n\t"
4036 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4037 "movq %%mm7, %%mm0 \n\t"
4038 // use mm7 mask to merge pa & pb
4039 "pand %%mm7, %%mm5 \n\t"
4040 // use mm0 mask copy to merge a & b
4041 "pand %%mm0, %%mm2 \n\t"
4042 "pandn %%mm4, %%mm7 \n\t"
4043 "pandn %%mm1, %%mm0 \n\t"
4044 "paddw %%mm5, %%mm7 \n\t"
4045 "paddw %%mm2, %%mm0 \n\t"
4046 // test ((pa <= pb)? pa:pb) <= pc
4047 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4048 "pxor %%mm1, %%mm1 \n\t"
4049 "pand %%mm7, %%mm3 \n\t"
4050 "pandn %%mm0, %%mm7 \n\t"
4051 "pxor %%mm1, %%mm1 \n\t"
4052 "paddw %%mm3, %%mm7 \n\t"
4053 "pxor %%mm0, %%mm0 \n\t"
4054 // step ecx to next set of 8 bytes and repeat loop til done
4055 "addl $8, %%ecx \n\t"
4056 "packuswb %%mm7, %%mm1 \n\t"
4057 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4058 "cmpl _MMXLength, %%ecx \n\t"
4059 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4060 // mm1 will be used as Raw(x-bpp) next loop
4061 "jb paeth_4lp \n\t"
4063 : "=S" (dummy_value_S), // output regs (dummy)
4064 "=D" (dummy_value_D)
4066 : "0" (prev_row), // esi // input regs
4067 "1" (row) // edi
4069 : "%ecx" // clobber list
4070 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4071 , "%mm0", "%mm1", "%mm2", "%mm3"
4072 , "%mm4", "%mm5", "%mm6", "%mm7"
4073 #endif
4074 );
4075 }
4076 break; // end 4 bpp
4078 case 8: // bpp == 8
4079 {
4080 _ActiveMask.use = 0x00000000ffffffffLL;
4082 __asm__ __volatile__ (
4083 "movl _dif, %%ecx \n\t"
4084 // preload "movl row, %%edi \n\t"
4085 // preload "movl prev_row, %%esi \n\t"
4086 "pxor %%mm0, %%mm0 \n\t"
4087 // prime the pump: load the first Raw(x-bpp) data set
4088 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4089 // a=Raw(x-bpp) bytes
4090 "paeth_8lp: \n\t"
4091 // do first set of 4 bytes
4092 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4093 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4094 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4095 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4096 // pav = p - a = (a + b - c) - a = b - c
4097 "movq %%mm2, %%mm4 \n\t"
4098 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4099 // pbv = p - b = (a + b - c) - b = a - c
4100 "movq %%mm1, %%mm5 \n\t"
4101 "psubw %%mm3, %%mm4 \n\t"
4102 "pxor %%mm7, %%mm7 \n\t"
4103 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4104 "movq %%mm4, %%mm6 \n\t"
4105 "psubw %%mm3, %%mm5 \n\t"
4106 // pa = abs(p-a) = abs(pav)
4107 // pb = abs(p-b) = abs(pbv)
4108 // pc = abs(p-c) = abs(pcv)
4109 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4110 "paddw %%mm5, %%mm6 \n\t"
4111 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4112 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4113 "psubw %%mm0, %%mm4 \n\t"
4114 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4115 "psubw %%mm0, %%mm4 \n\t"
4116 "psubw %%mm7, %%mm5 \n\t"
4117 "pxor %%mm0, %%mm0 \n\t"
4118 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4119 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4120 "psubw %%mm7, %%mm5 \n\t"
4121 "psubw %%mm0, %%mm6 \n\t"
4122 // test pa <= pb
4123 "movq %%mm4, %%mm7 \n\t"
4124 "psubw %%mm0, %%mm6 \n\t"
4125 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4126 "movq %%mm7, %%mm0 \n\t"
4127 // use mm7 mask to merge pa & pb
4128 "pand %%mm7, %%mm5 \n\t"
4129 // use mm0 mask copy to merge a & b
4130 "pand %%mm0, %%mm2 \n\t"
4131 "pandn %%mm4, %%mm7 \n\t"
4132 "pandn %%mm1, %%mm0 \n\t"
4133 "paddw %%mm5, %%mm7 \n\t"
4134 "paddw %%mm2, %%mm0 \n\t"
4135 // test ((pa <= pb)? pa:pb) <= pc
4136 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4137 "pxor %%mm1, %%mm1 \n\t"
4138 "pand %%mm7, %%mm3 \n\t"
4139 "pandn %%mm0, %%mm7 \n\t"
4140 "paddw %%mm3, %%mm7 \n\t"
4141 "pxor %%mm0, %%mm0 \n\t"
4142 "packuswb %%mm1, %%mm7 \n\t"
4143 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4144 "pand _ActiveMask, %%mm7 \n\t"
4145 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4146 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4147 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4148 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4149 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4151 // do second set of 4 bytes
4152 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4153 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4154 // pav = p - a = (a + b - c) - a = b - c
4155 "movq %%mm2, %%mm4 \n\t"
4156 // pbv = p - b = (a + b - c) - b = a - c
4157 "movq %%mm1, %%mm5 \n\t"
4158 "psubw %%mm3, %%mm4 \n\t"
4159 "pxor %%mm7, %%mm7 \n\t"
4160 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4161 "movq %%mm4, %%mm6 \n\t"
4162 "psubw %%mm3, %%mm5 \n\t"
4163 // pa = abs(p-a) = abs(pav)
4164 // pb = abs(p-b) = abs(pbv)
4165 // pc = abs(p-c) = abs(pcv)
4166 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4167 "paddw %%mm5, %%mm6 \n\t"
4168 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4169 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4170 "psubw %%mm0, %%mm4 \n\t"
4171 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4172 "psubw %%mm0, %%mm4 \n\t"
4173 "psubw %%mm7, %%mm5 \n\t"
4174 "pxor %%mm0, %%mm0 \n\t"
4175 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4176 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4177 "psubw %%mm7, %%mm5 \n\t"
4178 "psubw %%mm0, %%mm6 \n\t"
4179 // test pa <= pb
4180 "movq %%mm4, %%mm7 \n\t"
4181 "psubw %%mm0, %%mm6 \n\t"
4182 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4183 "movq %%mm7, %%mm0 \n\t"
4184 // use mm7 mask to merge pa & pb
4185 "pand %%mm7, %%mm5 \n\t"
4186 // use mm0 mask copy to merge a & b
4187 "pand %%mm0, %%mm2 \n\t"
4188 "pandn %%mm4, %%mm7 \n\t"
4189 "pandn %%mm1, %%mm0 \n\t"
4190 "paddw %%mm5, %%mm7 \n\t"
4191 "paddw %%mm2, %%mm0 \n\t"
4192 // test ((pa <= pb)? pa:pb) <= pc
4193 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4194 "pxor %%mm1, %%mm1 \n\t"
4195 "pand %%mm7, %%mm3 \n\t"
4196 "pandn %%mm0, %%mm7 \n\t"
4197 "pxor %%mm1, %%mm1 \n\t"
4198 "paddw %%mm3, %%mm7 \n\t"
4199 "pxor %%mm0, %%mm0 \n\t"
4200 // step ecx to next set of 8 bytes and repeat loop til done
4201 "addl $8, %%ecx \n\t"
4202 "packuswb %%mm7, %%mm1 \n\t"
4203 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4204 "cmpl _MMXLength, %%ecx \n\t"
4205 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4206 // mm1 will be used as Raw(x-bpp) next loop
4207 "jb paeth_8lp \n\t"
4209 : "=S" (dummy_value_S), // output regs (dummy)
4210 "=D" (dummy_value_D)
4212 : "0" (prev_row), // esi // input regs
4213 "1" (row) // edi
4215 : "%ecx" // clobber list
4216 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4217 , "%mm0", "%mm1", "%mm2", "%mm3"
4218 , "%mm4", "%mm5", "%mm6", "%mm7"
4219 #endif
4220 );
4221 }
4222 break; // end 8 bpp
4224 case 1: // bpp = 1
4225 case 2: // bpp = 2
4226 default: // bpp > 8
4227 {
4228 __asm__ __volatile__ (
4229 #ifdef __PIC__
4230 "pushl %%ebx \n\t" // save Global Offset Table index
4231 #endif
4232 "movl _dif, %%ebx \n\t"
4233 "cmpl _FullLength, %%ebx \n\t"
4234 "jnb paeth_dend \n\t"
4236 // preload "movl row, %%edi \n\t"
4237 // preload "movl prev_row, %%esi \n\t"
4238 // do Paeth decode for remaining bytes
4239 "movl %%ebx, %%edx \n\t"
4240 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4241 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4242 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4244 "paeth_dlp: \n\t"
4245 "xorl %%eax, %%eax \n\t"
4246 // pav = p - a = (a + b - c) - a = b - c
4247 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4248 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4249 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4250 "movl %%eax, _patemp \n\t" // Save pav for later use
4251 "xorl %%eax, %%eax \n\t"
4252 // pbv = p - b = (a + b - c) - b = a - c
4253 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4254 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4255 "movl %%eax, %%ecx \n\t"
4256 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4257 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4258 // pc = abs(pcv)
4259 "testl $0x80000000, %%eax \n\t"
4260 "jz paeth_dpca \n\t"
4261 "negl %%eax \n\t" // reverse sign of neg values
4263 "paeth_dpca: \n\t"
4264 "movl %%eax, _pctemp \n\t" // save pc for later use
4265 // pb = abs(pbv)
4266 "testl $0x80000000, %%ecx \n\t"
4267 "jz paeth_dpba \n\t"
4268 "negl %%ecx \n\t" // reverse sign of neg values
4270 "paeth_dpba: \n\t"
4271 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4272 // pa = abs(pav)
4273 "movl _patemp, %%eax \n\t"
4274 "testl $0x80000000, %%eax \n\t"
4275 "jz paeth_dpaa \n\t"
4276 "negl %%eax \n\t" // reverse sign of neg values
4278 "paeth_dpaa: \n\t"
4279 "movl %%eax, _patemp \n\t" // save pa for later use
4280 // test if pa <= pb
4281 "cmpl %%ecx, %%eax \n\t"
4282 "jna paeth_dabb \n\t"
4283 // pa > pb; now test if pb <= pc
4284 "cmpl _pctemp, %%ecx \n\t"
4285 "jna paeth_dbbc \n\t"
4286 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4287 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4288 "jmp paeth_dpaeth \n\t"
4290 "paeth_dbbc: \n\t"
4291 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4292 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4293 "jmp paeth_dpaeth \n\t"
4295 "paeth_dabb: \n\t"
4296 // pa <= pb; now test if pa <= pc
4297 "cmpl _pctemp, %%eax \n\t"
4298 "jna paeth_dabc \n\t"
4299 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4300 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4301 "jmp paeth_dpaeth \n\t"
4303 "paeth_dabc: \n\t"
4304 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4305 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4307 "paeth_dpaeth: \n\t"
4308 "incl %%ebx \n\t"
4309 "incl %%edx \n\t"
4310 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4311 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4312 "cmpl _FullLength, %%ebx \n\t"
4313 "jb paeth_dlp \n\t"
4315 "paeth_dend: \n\t"
4316 #ifdef __PIC__
4317 "popl %%ebx \n\t" // index to Global Offset Table
4318 #endif
4320 : "=c" (dummy_value_c), // output regs (dummy)
4321 "=S" (dummy_value_S),
4322 "=D" (dummy_value_D)
4324 : "0" (bpp), // ecx // input regs
4325 "1" (prev_row), // esi
4326 "2" (row) // edi
4328 : "%eax", "%edx" // clobber list
4329 #ifndef __PIC__
4330 , "%ebx"
4331 #endif
4332 );
4333 }
4334 return; // No need to go further with this one
4336 } // end switch (bpp)
4338 __asm__ __volatile__ (
4339 // MMX acceleration complete; now do clean-up
4340 // check if any remaining bytes left to decode
4341 #ifdef __PIC__
4342 "pushl %%ebx \n\t" // save index to Global Offset Table
4343 #endif
4344 "movl _MMXLength, %%ebx \n\t"
4345 "cmpl _FullLength, %%ebx \n\t"
4346 "jnb paeth_end \n\t"
4347 //pre "movl row, %%edi \n\t"
4348 //pre "movl prev_row, %%esi \n\t"
4349 // do Paeth decode for remaining bytes
4350 "movl %%ebx, %%edx \n\t"
4351 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4352 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4353 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4355 "paeth_lp2: \n\t"
4356 "xorl %%eax, %%eax \n\t"
4357 // pav = p - a = (a + b - c) - a = b - c
4358 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4359 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4360 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4361 "movl %%eax, _patemp \n\t" // Save pav for later use
4362 "xorl %%eax, %%eax \n\t"
4363 // pbv = p - b = (a + b - c) - b = a - c
4364 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4365 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4366 "movl %%eax, %%ecx \n\t"
4367 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4368 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4369 // pc = abs(pcv)
4370 "testl $0x80000000, %%eax \n\t"
4371 "jz paeth_pca2 \n\t"
4372 "negl %%eax \n\t" // reverse sign of neg values
4374 "paeth_pca2: \n\t"
4375 "movl %%eax, _pctemp \n\t" // save pc for later use
4376 // pb = abs(pbv)
4377 "testl $0x80000000, %%ecx \n\t"
4378 "jz paeth_pba2 \n\t"
4379 "negl %%ecx \n\t" // reverse sign of neg values
4381 "paeth_pba2: \n\t"
4382 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4383 // pa = abs(pav)
4384 "movl _patemp, %%eax \n\t"
4385 "testl $0x80000000, %%eax \n\t"
4386 "jz paeth_paa2 \n\t"
4387 "negl %%eax \n\t" // reverse sign of neg values
4389 "paeth_paa2: \n\t"
4390 "movl %%eax, _patemp \n\t" // save pa for later use
4391 // test if pa <= pb
4392 "cmpl %%ecx, %%eax \n\t"
4393 "jna paeth_abb2 \n\t"
4394 // pa > pb; now test if pb <= pc
4395 "cmpl _pctemp, %%ecx \n\t"
4396 "jna paeth_bbc2 \n\t"
4397 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4398 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4399 "jmp paeth_paeth2 \n\t"
4401 "paeth_bbc2: \n\t"
4402 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4403 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4404 "jmp paeth_paeth2 \n\t"
4406 "paeth_abb2: \n\t"
4407 // pa <= pb; now test if pa <= pc
4408 "cmpl _pctemp, %%eax \n\t"
4409 "jna paeth_abc2 \n\t"
4410 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4411 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4412 "jmp paeth_paeth2 \n\t"
4414 "paeth_abc2: \n\t"
4415 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4416 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4418 "paeth_paeth2: \n\t"
4419 "incl %%ebx \n\t"
4420 "incl %%edx \n\t"
4421 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4422 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4423 "cmpl _FullLength, %%ebx \n\t"
4424 "jb paeth_lp2 \n\t"
4426 "paeth_end: \n\t"
4427 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4428 #ifdef __PIC__
4429 "popl %%ebx \n\t" // restore index to Global Offset Table
4430 #endif
4432 : "=c" (dummy_value_c), // output regs (dummy)
4433 "=S" (dummy_value_S),
4434 "=D" (dummy_value_D)
4436 : "0" (bpp), // ecx // input regs
4437 "1" (prev_row), // esi
4438 "2" (row) // edi
4440 : "%eax", "%edx" // clobber list (no input regs!)
4441 #ifndef __PIC__
4442 , "%ebx"
4443 #endif
4444 );
4446 } /* end png_read_filter_row_mmx_paeth() */
4447 #endif
4452 #ifdef PNG_THREAD_UNSAFE_OK
4453 //===========================================================================//
4454 // //
4455 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4456 // //
4457 //===========================================================================//
4459 // Optimized code for PNG Sub filter decoder
4461 static void /* PRIVATE */
4462 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4463 {
4464 int bpp;
4465 int dummy_value_a;
4466 int dummy_value_D;
4468 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4469 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4471 __asm__ __volatile__ (
4472 //pre "movl row, %%edi \n\t"
4473 "movl %%edi, %%esi \n\t" // lp = row
4474 //pre "movl bpp, %%eax \n\t"
4475 "addl %%eax, %%edi \n\t" // rp = row + bpp
4476 //irr "xorl %%eax, %%eax \n\t"
4477 // get # of bytes to alignment
4478 "movl %%edi, _dif \n\t" // take start of row
4479 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4480 // alignment boundary
4481 "xorl %%ecx, %%ecx \n\t"
4482 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4483 "subl %%edi, _dif \n\t" // subtract from start ==> value
4484 "jz sub_go \n\t" // ecx at alignment
4486 "sub_lp1: \n\t" // fix alignment
4487 "movb (%%esi,%%ecx,), %%al \n\t"
4488 "addb %%al, (%%edi,%%ecx,) \n\t"
4489 "incl %%ecx \n\t"
4490 "cmpl _dif, %%ecx \n\t"
4491 "jb sub_lp1 \n\t"
4493 "sub_go: \n\t"
4494 "movl _FullLength, %%eax \n\t"
4495 "movl %%eax, %%edx \n\t"
4496 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4497 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4498 "subl %%edx, %%eax \n\t" // drop over bytes from length
4499 "movl %%eax, _MMXLength \n\t"
4501 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4502 "=D" (dummy_value_D) // 1
4504 : "0" (bpp), // eax // input regs
4505 "1" (row) // edi
4507 : "%ebx", "%ecx", "%edx" // clobber list
4508 , "%esi"
4510 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4511 , "%mm0", "%mm1", "%mm2", "%mm3"
4512 , "%mm4", "%mm5", "%mm6", "%mm7"
4513 #endif
4514 );
4516 // now do the math for the rest of the row
4517 switch (bpp)
4518 {
4519 case 3:
4520 {
4521 _ActiveMask.use = 0x0000ffffff000000LL;
4522 _ShiftBpp.use = 24; // == 3 * 8
4523 _ShiftRem.use = 40; // == 64 - 24
4525 __asm__ __volatile__ (
4526 // preload "movl row, %%edi \n\t"
4527 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4528 // active byte group
4529 "movl %%edi, %%esi \n\t" // lp = row
4530 // preload "movl bpp, %%eax \n\t"
4531 "addl %%eax, %%edi \n\t" // rp = row + bpp
4532 "movq %%mm7, %%mm6 \n\t"
4533 "movl _dif, %%edx \n\t"
4534 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4535 // 3rd active byte group
4536 // prime the pump: load the first Raw(x-bpp) data set
4537 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4539 "sub_3lp: \n\t" // shift data for adding first
4540 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4541 // shift clears inactive bytes)
4542 // add 1st active group
4543 "movq (%%edi,%%edx,), %%mm0 \n\t"
4544 "paddb %%mm1, %%mm0 \n\t"
4546 // add 2nd active group
4547 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4548 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4549 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4550 "paddb %%mm1, %%mm0 \n\t"
4552 // add 3rd active group
4553 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4554 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4555 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4556 "addl $8, %%edx \n\t"
4557 "paddb %%mm1, %%mm0 \n\t"
4559 "cmpl _MMXLength, %%edx \n\t"
4560 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4561 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4562 "jb sub_3lp \n\t"
4564 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4565 "=D" (dummy_value_D) // 1
4567 : "0" (bpp), // eax // input regs
4568 "1" (row) // edi
4570 : "%edx", "%esi" // clobber list
4571 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4572 , "%mm0", "%mm1", "%mm6", "%mm7"
4573 #endif
4574 );
4575 }
4576 break;
4578 case 1:
4579 {
4580 __asm__ __volatile__ (
4581 "movl _dif, %%edx \n\t"
4582 // preload "movl row, %%edi \n\t"
4583 "cmpl _FullLength, %%edx \n\t"
4584 "jnb sub_1end \n\t"
4585 "movl %%edi, %%esi \n\t" // lp = row
4586 "xorl %%eax, %%eax \n\t"
4587 // preload "movl bpp, %%eax \n\t"
4588 "addl %%eax, %%edi \n\t" // rp = row + bpp
4590 "sub_1lp: \n\t"
4591 "movb (%%esi,%%edx,), %%al \n\t"
4592 "addb %%al, (%%edi,%%edx,) \n\t"
4593 "incl %%edx \n\t"
4594 "cmpl _FullLength, %%edx \n\t"
4595 "jb sub_1lp \n\t"
4597 "sub_1end: \n\t"
4599 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4600 "=D" (dummy_value_D) // 1
4602 : "0" (bpp), // eax // input regs
4603 "1" (row) // edi
4605 : "%edx", "%esi" // clobber list
4606 );
4607 }
4608 return;
4610 case 6:
4611 case 4:
4612 //case 7: // GRR BOGUS
4613 //case 5: // GRR BOGUS
4614 {
4615 _ShiftBpp.use = bpp << 3;
4616 _ShiftRem.use = 64 - _ShiftBpp.use;
4618 __asm__ __volatile__ (
4619 // preload "movl row, %%edi \n\t"
4620 "movl _dif, %%edx \n\t"
4621 "movl %%edi, %%esi \n\t" // lp = row
4622 // preload "movl bpp, %%eax \n\t"
4623 "addl %%eax, %%edi \n\t" // rp = row + bpp
4625 // prime the pump: load the first Raw(x-bpp) data set
4626 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4628 "sub_4lp: \n\t" // shift data for adding first
4629 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4630 // shift clears inactive bytes)
4631 "movq (%%edi,%%edx,), %%mm0 \n\t"
4632 "paddb %%mm1, %%mm0 \n\t"
4634 // add 2nd active group
4635 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4636 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4637 "addl $8, %%edx \n\t"
4638 "paddb %%mm1, %%mm0 \n\t"
4640 "cmpl _MMXLength, %%edx \n\t"
4641 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4642 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4643 "jb sub_4lp \n\t"
4645 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4646 "=D" (dummy_value_D) // 1
4648 : "0" (bpp), // eax // input regs
4649 "1" (row) // edi
4651 : "%edx", "%esi" // clobber list
4652 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4653 , "%mm0", "%mm1"
4654 #endif
4655 );
4656 }
4657 break;
4659 case 2:
4660 {
4661 _ActiveMask.use = 0x00000000ffff0000LL;
4662 _ShiftBpp.use = 16; // == 2 * 8
4663 _ShiftRem.use = 48; // == 64 - 16
4665 __asm__ __volatile__ (
4666 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4667 // active byte group
4668 "movl _dif, %%edx \n\t"
4669 "movq %%mm7, %%mm6 \n\t"
4670 // preload "movl row, %%edi \n\t"
4671 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4672 // 3rd active byte group
4673 "movl %%edi, %%esi \n\t" // lp = row
4674 "movq %%mm6, %%mm5 \n\t"
4675 // preload "movl bpp, %%eax \n\t"
4676 "addl %%eax, %%edi \n\t" // rp = row + bpp
4677 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4678 // 4th active byte group
4679 // prime the pump: load the first Raw(x-bpp) data set
4680 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4682 "sub_2lp: \n\t" // shift data for adding first
4683 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4684 // shift clears inactive bytes)
4685 // add 1st active group
4686 "movq (%%edi,%%edx,), %%mm0 \n\t"
4687 "paddb %%mm1, %%mm0 \n\t"
4689 // add 2nd active group
4690 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4691 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4692 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4693 "paddb %%mm1, %%mm0 \n\t"
4695 // add 3rd active group
4696 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4697 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4698 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4699 "paddb %%mm1, %%mm0 \n\t"
4701 // add 4th active group
4702 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4703 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4704 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4705 "addl $8, %%edx \n\t"
4706 "paddb %%mm1, %%mm0 \n\t"
4707 "cmpl _MMXLength, %%edx \n\t"
4708 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4709 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4710 "jb sub_2lp \n\t"
4712 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4713 "=D" (dummy_value_D) // 1
4715 : "0" (bpp), // eax // input regs
4716 "1" (row) // edi
4718 : "%edx", "%esi" // clobber list
4719 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4720 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4721 #endif
4722 );
4723 }
4724 break;
4726 case 8:
4727 {
4728 __asm__ __volatile__ (
4729 // preload "movl row, %%edi \n\t"
4730 "movl _dif, %%edx \n\t"
4731 "movl %%edi, %%esi \n\t" // lp = row
4732 // preload "movl bpp, %%eax \n\t"
4733 "addl %%eax, %%edi \n\t" // rp = row + bpp
4734 "movl _MMXLength, %%ecx \n\t"
4736 // prime the pump: load the first Raw(x-bpp) data set
4737 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4738 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4740 "sub_8lp: \n\t"
4741 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4742 "paddb %%mm7, %%mm0 \n\t"
4743 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4744 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4746 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4747 // This will be repeated for each group of 8 bytes with the 8th
4748 // group being used as the Raw(x-bpp) for the 1st group of the
4749 // next loop.
4751 "paddb %%mm0, %%mm1 \n\t"
4752 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4753 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4754 "paddb %%mm1, %%mm2 \n\t"
4755 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4756 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4757 "paddb %%mm2, %%mm3 \n\t"
4758 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4759 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4760 "paddb %%mm3, %%mm4 \n\t"
4761 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4762 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4763 "paddb %%mm4, %%mm5 \n\t"
4764 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4765 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4766 "paddb %%mm5, %%mm6 \n\t"
4767 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4768 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4769 "addl $64, %%edx \n\t"
4770 "paddb %%mm6, %%mm7 \n\t"
4771 "cmpl %%ecx, %%edx \n\t"
4772 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4773 "jb sub_8lp \n\t"
4775 "cmpl _MMXLength, %%edx \n\t"
4776 "jnb sub_8lt8 \n\t"
4778 "sub_8lpA: \n\t"
4779 "movq (%%edi,%%edx,), %%mm0 \n\t"
4780 "addl $8, %%edx \n\t"
4781 "paddb %%mm7, %%mm0 \n\t"
4782 "cmpl _MMXLength, %%edx \n\t"
4783 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4784 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4785 // to mm1 to be new Raw(x-bpp)
4786 // for next loop
4787 "jb sub_8lpA \n\t"
4789 "sub_8lt8: \n\t"
4791 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4792 "=D" (dummy_value_D) // 1
4794 : "0" (bpp), // eax // input regs
4795 "1" (row) // edi
4797 : "%ecx", "%edx", "%esi" // clobber list
4798 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4799 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4800 #endif
4801 );
4802 }
4803 break;
4805 default: // bpp greater than 8 bytes GRR BOGUS
4806 {
4807 __asm__ __volatile__ (
4808 "movl _dif, %%edx \n\t"
4809 // preload "movl row, %%edi \n\t"
4810 "movl %%edi, %%esi \n\t" // lp = row
4811 // preload "movl bpp, %%eax \n\t"
4812 "addl %%eax, %%edi \n\t" // rp = row + bpp
4814 "sub_Alp: \n\t"
4815 "movq (%%edi,%%edx,), %%mm0 \n\t"
4816 "movq (%%esi,%%edx,), %%mm1 \n\t"
4817 "addl $8, %%edx \n\t"
4818 "paddb %%mm1, %%mm0 \n\t"
4819 "cmpl _MMXLength, %%edx \n\t"
4820 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4821 // -8 to offset addl edx
4822 "jb sub_Alp \n\t"
4824 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4825 "=D" (dummy_value_D) // 1
4827 : "0" (bpp), // eax // input regs
4828 "1" (row) // edi
4830 : "%edx", "%esi" // clobber list
4831 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832 , "%mm0", "%mm1"
4833 #endif
4834 );
4835 }
4836 break;
4838 } // end switch (bpp)
4840 __asm__ __volatile__ (
4841 "movl _MMXLength, %%edx \n\t"
4842 //pre "movl row, %%edi \n\t"
4843 "cmpl _FullLength, %%edx \n\t"
4844 "jnb sub_end \n\t"
4846 "movl %%edi, %%esi \n\t" // lp = row
4847 //pre "movl bpp, %%eax \n\t"
4848 "addl %%eax, %%edi \n\t" // rp = row + bpp
4849 "xorl %%eax, %%eax \n\t"
4851 "sub_lp2: \n\t"
4852 "movb (%%esi,%%edx,), %%al \n\t"
4853 "addb %%al, (%%edi,%%edx,) \n\t"
4854 "incl %%edx \n\t"
4855 "cmpl _FullLength, %%edx \n\t"
4856 "jb sub_lp2 \n\t"
4858 "sub_end: \n\t"
4859 "EMMS \n\t" // end MMX instructions
4861 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4862 "=D" (dummy_value_D) // 1
4864 : "0" (bpp), // eax // input regs
4865 "1" (row) // edi
4867 : "%edx", "%esi" // clobber list
4868 );
4870 } // end of png_read_filter_row_mmx_sub()
4871 #endif
4876 //===========================================================================//
4877 // //
4878 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4879 // //
4880 //===========================================================================//
4882 // Optimized code for PNG Up filter decoder
4884 static void /* PRIVATE */
4885 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4886 png_bytep prev_row)
4887 {
4888 png_uint_32 len;
4889 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4890 int dummy_value_S;
4891 int dummy_value_D;
4893 len = row_info->rowbytes; // number of bytes to filter
4895 __asm__ __volatile__ (
4896 //pre "movl row, %%edi \n\t"
4897 // get # of bytes to alignment
4898 "movl %%edi, %%ecx \n\t"
4899 "xorl %%ebx, %%ebx \n\t"
4900 "addl $0x7, %%ecx \n\t"
4901 "xorl %%eax, %%eax \n\t"
4902 "andl $0xfffffff8, %%ecx \n\t"
4903 //pre "movl prev_row, %%esi \n\t"
4904 "subl %%edi, %%ecx \n\t"
4905 "jz up_go \n\t"
4907 "up_lp1: \n\t" // fix alignment
4908 "movb (%%edi,%%ebx,), %%al \n\t"
4909 "addb (%%esi,%%ebx,), %%al \n\t"
4910 "incl %%ebx \n\t"
4911 "cmpl %%ecx, %%ebx \n\t"
4912 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4913 "jb up_lp1 \n\t" // offset incl ebx
4915 "up_go: \n\t"
4916 //pre "movl len, %%edx \n\t"
4917 "movl %%edx, %%ecx \n\t"
4918 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4919 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4920 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4922 // unrolled loop - use all MMX registers and interleave to reduce
4923 // number of branch instructions (loops) and reduce partial stalls
4924 "up_loop: \n\t"
4925 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4926 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4927 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4928 "paddb %%mm1, %%mm0 \n\t"
4929 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4930 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4931 "paddb %%mm3, %%mm2 \n\t"
4932 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4933 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4934 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4935 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4936 "paddb %%mm5, %%mm4 \n\t"
4937 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4938 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4939 "paddb %%mm7, %%mm6 \n\t"
4940 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4941 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4942 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4943 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4944 "paddb %%mm1, %%mm0 \n\t"
4945 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4946 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4947 "paddb %%mm3, %%mm2 \n\t"
4948 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4949 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4950 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4951 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4952 "paddb %%mm5, %%mm4 \n\t"
4953 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4954 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4955 "addl $64, %%ebx \n\t"
4956 "paddb %%mm7, %%mm6 \n\t"
4957 "cmpl %%ecx, %%ebx \n\t"
4958 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4959 "jb up_loop \n\t" // -8 to offset addl ebx
4961 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
4962 "jz up_end \n\t"
4964 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
4965 "jb up_lt8 \n\t" // [added by lcreeve@netins.net]
4967 "addl %%edx, %%ecx \n\t"
4968 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4969 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4970 "jz up_lt8 \n\t"
4972 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
4973 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4974 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4975 "addl $8, %%ebx \n\t"
4976 "paddb %%mm1, %%mm0 \n\t"
4977 "cmpl %%ecx, %%ebx \n\t"
4978 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4979 "jb up_lpA \n\t" // offset add ebx
4980 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
4981 "jz up_end \n\t"
4983 "up_lt8: \n\t"
4984 "xorl %%eax, %%eax \n\t"
4985 "addl %%edx, %%ecx \n\t" // move over byte count into counter
4987 "up_lp2: \n\t" // use x86 regs for remaining bytes
4988 "movb (%%edi,%%ebx,), %%al \n\t"
4989 "addb (%%esi,%%ebx,), %%al \n\t"
4990 "incl %%ebx \n\t"
4991 "cmpl %%ecx, %%ebx \n\t"
4992 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4993 "jb up_lp2 \n\t" // offset inc ebx
4995 "up_end: \n\t"
4996 "EMMS \n\t" // conversion of filtered row complete
4998 : "=d" (dummy_value_d), // 0 // output regs (dummy)
4999 "=S" (dummy_value_S), // 1
5000 "=D" (dummy_value_D) // 2
5002 : "0" (len), // edx // input regs
5003 "1" (prev_row), // esi
5004 "2" (row) // edi
5006 : "%eax", "%ebx", "%ecx" // clobber list (no input regs!)
5008 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5009 , "%mm0", "%mm1", "%mm2", "%mm3"
5010 , "%mm4", "%mm5", "%mm6", "%mm7"
5011 #endif
5012 );
5014 } // end of png_read_filter_row_mmx_up()
5016 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5021 /*===========================================================================*/
5022 /* */
5023 /* P N G _ R E A D _ F I L T E R _ R O W */
5024 /* */
5025 /*===========================================================================*/
5028 /* Optimized png_read_filter_row routines */
5030 void /* PRIVATE */
5031 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5032 row, png_bytep prev_row, int filter)
5033 {
5034 #ifdef PNG_DEBUG
5035 char filnm[10];
5036 #endif
5038 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5039 /* GRR: these are superseded by png_ptr->asm_flags: */
5040 #define UseMMX_sub 1 // GRR: converted 20000730
5041 #define UseMMX_up 1 // GRR: converted 20000729
5042 #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5043 #define UseMMX_paeth 1 // GRR: converted 20000828
5045 if (_mmx_supported == 2) {
5046 /* this should have happened in png_init_mmx_flags() already */
5047 png_warning(png_ptr, "asm_flags may not have been initialized");
5048 png_mmx_support();
5049 }
5050 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5052 #ifdef PNG_DEBUG
5053 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5054 switch (filter)
5055 {
5056 case 0: sprintf(filnm, "none");
5057 break;
5058 case 1: sprintf(filnm, "sub-%s",
5059 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5060 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5061 #endif
5062 "x86");
5063 break;
5064 case 2: sprintf(filnm, "up-%s",
5065 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5066 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5067 #endif
5068 "x86");
5069 break;
5070 case 3: sprintf(filnm, "avg-%s",
5071 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5072 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5073 #endif
5074 "x86");
5075 break;
5076 case 4: sprintf(filnm, "Paeth-%s",
5077 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5078 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5079 #endif
5080 "x86");
5081 break;
5082 default: sprintf(filnm, "unknw");
5083 break;
5084 }
5085 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5086 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5087 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5088 (int)((row_info->pixel_depth + 7) >> 3));
5089 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5090 #endif /* PNG_DEBUG */
5092 switch (filter)
5093 {
5094 case PNG_FILTER_VALUE_NONE:
5095 break;
5097 case PNG_FILTER_VALUE_SUB:
5098 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5099 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5100 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5101 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5102 {
5103 png_read_filter_row_mmx_sub(row_info, row);
5104 }
5105 else
5106 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5107 {
5108 png_uint_32 i;
5109 png_uint_32 istop = row_info->rowbytes;
5110 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5111 png_bytep rp = row + bpp;
5112 png_bytep lp = row;
5114 for (i = bpp; i < istop; i++)
5115 {
5116 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5117 rp++;
5118 }
5119 } /* end !UseMMX_sub */
5120 break;
5122 case PNG_FILTER_VALUE_UP:
5123 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5124 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5125 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5126 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5127 {
5128 png_read_filter_row_mmx_up(row_info, row, prev_row);
5129 }
5130 else
5131 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5132 {
5133 png_uint_32 i;
5134 png_uint_32 istop = row_info->rowbytes;
5135 png_bytep rp = row;
5136 png_bytep pp = prev_row;
5138 for (i = 0; i < istop; ++i)
5139 {
5140 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5141 rp++;
5142 }
5143 } /* end !UseMMX_up */
5144 break;
5146 case PNG_FILTER_VALUE_AVG:
5147 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5148 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5149 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5150 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5151 {
5152 png_read_filter_row_mmx_avg(row_info, row, prev_row);
5153 }
5154 else
5155 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5156 {
5157 png_uint_32 i;
5158 png_bytep rp = row;
5159 png_bytep pp = prev_row;
5160 png_bytep lp = row;
5161 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5162 png_uint_32 istop = row_info->rowbytes - bpp;
5164 for (i = 0; i < bpp; i++)
5165 {
5166 *rp = (png_byte)(((int)(*rp) +
5167 ((int)(*pp++) >> 1)) & 0xff);
5168 rp++;
5169 }
5171 for (i = 0; i < istop; i++)
5172 {
5173 *rp = (png_byte)(((int)(*rp) +
5174 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5175 rp++;
5176 }
5177 } /* end !UseMMX_avg */
5178 break;
5180 case PNG_FILTER_VALUE_PAETH:
5181 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5182 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5183 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5184 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5185 {
5186 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5187 }
5188 else
5189 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5190 {
5191 png_uint_32 i;
5192 png_bytep rp = row;
5193 png_bytep pp = prev_row;
5194 png_bytep lp = row;
5195 png_bytep cp = prev_row;
5196 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5197 png_uint_32 istop = row_info->rowbytes - bpp;
5199 for (i = 0; i < bpp; i++)
5200 {
5201 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5202 rp++;
5203 }
5205 for (i = 0; i < istop; i++) /* use leftover rp,pp */
5206 {
5207 int a, b, c, pa, pb, pc, p;
5209 a = *lp++;
5210 b = *pp++;
5211 c = *cp++;
5213 p = b - c;
5214 pc = a - c;
5216 #ifdef PNG_USE_ABS
5217 pa = abs(p);
5218 pb = abs(pc);
5219 pc = abs(p + pc);
5220 #else
5221 pa = p < 0 ? -p : p;
5222 pb = pc < 0 ? -pc : pc;
5223 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5224 #endif
5226 /*
5227 if (pa <= pb && pa <= pc)
5228 p = a;
5229 else if (pb <= pc)
5230 p = b;
5231 else
5232 p = c;
5233 */
5235 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5237 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5238 rp++;
5239 }
5240 } /* end !UseMMX_paeth */
5241 break;
5243 default:
5244 png_warning(png_ptr, "Ignoring bad row-filter type");
5245 *row=0;
5246 break;
5247 }
5248 }
5250 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5253 /*===========================================================================*/
5254 /* */
5255 /* P N G _ M M X _ S U P P O R T */
5256 /* */
5257 /*===========================================================================*/
5259 /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5260 * (2) all instructions compile with gcc 2.7.2.3 and later
5261 * (3) the function is moved down here to prevent gcc from
5262 * inlining it in multiple places and then barfing be-
5263 * cause the ".NOT_SUPPORTED" label is multiply defined
5264 * [is there a way to signal that a *single* function should
5265 * not be inlined? is there a way to modify the label for
5266 * each inlined instance, e.g., by appending _1, _2, etc.?
5267 * maybe if don't use leading "." in label name? (nope...sigh)]
5268 */
5270 int PNGAPI
5271 png_mmx_support(void)
5272 {
5273 #if defined(PNG_MMX_CODE_SUPPORTED)
5274 __asm__ __volatile__ (
5275 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5276 "pushl %%ecx \n\t" // so does ecx...
5277 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5278 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5279 // "pushf \n\t" // 16-bit pushf
5280 "pushfl \n\t" // save Eflag to stack
5281 "popl %%eax \n\t" // get Eflag from stack into eax
5282 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5283 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5284 "pushl %%eax \n\t" // save modified Eflag back to stack
5285 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5286 // "popf \n\t" // 16-bit popf
5287 "popfl \n\t" // restore modified value to Eflag reg
5288 "pushfl \n\t" // save Eflag to stack
5289 "popl %%eax \n\t" // get Eflag from stack
5290 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5291 "jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
5293 "xorl %%eax, %%eax \n\t" // set eax to zero
5294 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5295 "cpuid \n\t" // get the CPU identification info
5296 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
5297 "jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
5299 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5300 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5301 // faster than the instruction "mov eax, 1"
5302 "cpuid \n\t" // get the CPU identification info again
5303 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5304 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5305 "jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
5307 "movl $1, %%eax \n\t" // set return value to 1
5308 "jmp .RETURN \n\t" // DONE: have MMX support
5310 ".NOT_SUPPORTED: \n\t" // target label for jump instructions
5311 "movl $0, %%eax \n\t" // set return value to 0
5312 ".RETURN: \n\t" // target label for jump instructions
5313 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5314 "popl %%edx \n\t" // restore edx
5315 "popl %%ecx \n\t" // restore ecx
5316 "popl %%ebx \n\t" // restore ebx
5318 // "ret \n\t" // DONE: no MMX support
5319 // (fall through to standard C "ret")
5321 : // output list (none)
5323 : // any variables used on input (none)
5325 : "%eax" // clobber list
5326 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5327 // , "memory" // if write to a variable gcc thought was in a reg
5328 // , "cc" // "condition codes" (flag bits)
5329 );
5330 #else
5331 _mmx_supported = 0;
5332 #endif /* PNG_MMX_CODE_SUPPORTED */
5334 return _mmx_supported;
5335 }
5338 #endif /* PNG_USE_PNGGCCRD */