1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
5 * libpng 1.2.0 - September 1, 2001
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
19 *
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
22 * [runtime MMX configuration, GRR 20010102]
23 *
24 */
26 #define PNG_INTERNAL
27 #include "png.h"
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
/* Tri-state MMX availability flag:
 *   2 = not yet probed (initial value; png_init_mmx_flags() should have run),
 *   1 = CPU supports MMX, 0 = it does not.
 * Written by png_mmx_support(); consulted by the row routines below. */
static int mmx_supported=2;
/* Probe the CPU for MMX support.
 *
 * Method: first verify the CPUID instruction exists by toggling the ID bit
 * (bit 21) of EFLAGS -- if the bit cannot be changed, CPUID is absent and
 * MMX is reported unsupported.  Then CPUID leaf 0 confirms leaf 1 is
 * available, and CPUID leaf 1 returns the feature flags in EDX, where
 * bit 23 is the MMX feature bit.
 *
 * Returns 1 if MMX is available, 0 otherwise, and caches the result in
 * the file-scope variable mmx_supported. */
int PNGAPI
png_mmx_support(void)
{
    int mmx_supported_local = 0;
    _asm {
        push ebx          //CPUID will trash these
        push ecx
        push edx
        pushfd            //Save Eflag to stack
        pop eax           //Get Eflag from stack into eax
        mov ecx, eax      //Make another copy of Eflag in ecx
        xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
        push eax          //Save modified Eflag back to stack

        popfd             //Restore modified value back to Eflag reg
        pushfd            //Save Eflag to stack
        pop eax           //Get Eflag from stack
        xor eax, ecx      //Compare the new Eflag with the original Eflag
        jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
                          //skip following instructions and jump to
                          //NOT_SUPPORTED label

        xor eax, eax      //Set eax to zero (CPUID leaf 0)

        _asm _emit 0x0f   //CPUID instruction (two bytes opcode)
        _asm _emit 0xa2

        cmp eax, 1        //make sure eax returns non-zero value
        jl NOT_SUPPORTED  //If eax is zero, leaf 1 is absent; mmx not supported

        xor eax, eax      //set eax to zero
        inc eax           //Now increment eax to 1 (CPUID leaf 1).  This
                          //instruction is faster than "mov eax, 1"

        _asm _emit 0x0f   //CPUID instruction
        _asm _emit 0xa2

        and edx, 0x00800000 //mask out all bits but the MMX feature bit
                            //(EDX bit 23; the original comment said bit 24,
                            //which was wrong)
        cmp edx, 0          //zero = mmx not supported
        jz NOT_SUPPORTED    //non-zero = Yes, mmx IS supported

        mov mmx_supported_local, 1 //set return value to 1

NOT_SUPPORTED:
        mov eax, mmx_supported_local //move return value to eax
        pop edx           //CPUID trashed these
        pop ecx
        pop ebx
    }

    //mmx_supported_local=0; // test code to force "don't support MMX"
    //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

    /* Cache the probe result so the row routines can consult it cheaply. */
    mmx_supported = mmx_supported_local;
    return mmx_supported_local;
}
91 /* Combines the row recently read in with the previous row.
92 This routine takes care of alpha and transparency if requested.
93 This routine also handles the two methods of progressive display
94 of interlaced images, depending on the mask value.
95 The mask value describes which pixels are to be combined with
96 the row. The pattern always repeats every 8 pixels, so just 8
97 bits are needed. A one indicates the pixel is to be combined; a
98 zero indicates the pixel is to be skipped. This is in addition
99 to any alpha or transparency value associated with the pixel. If
100 you want all pixels to be combined, pass 0xff (255) in mask. */
102 /* Use this routine for x86 platform - uses faster MMX routine if machine
103 supports MMX */
105 void /* PRIVATE */
106 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
107 {
108 #ifdef PNG_USE_LOCAL_ARRAYS
109 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
110 #endif
112 png_debug(1,"in png_combine_row_asm\n");
114 if (mmx_supported == 2) {
115 /* this should have happened in png_init_mmx_flags() already */
116 png_warning(png_ptr, "asm_flags may not have been initialized");
117 png_mmx_support();
118 }
120 if (mask == 0xff)
121 {
122 png_memcpy(row, png_ptr->row_buf + 1,
123 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
124 }
125 /* GRR: add "else if (mask == 0)" case?
126 * or does png_combine_row() not even get called in that case? */
127 else
128 {
129 switch (png_ptr->row_info.pixel_depth)
130 {
131 case 1:
132 {
133 png_bytep sp;
134 png_bytep dp;
135 int s_inc, s_start, s_end;
136 int m;
137 int shift;
138 png_uint_32 i;
140 sp = png_ptr->row_buf + 1;
141 dp = row;
142 m = 0x80;
143 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
144 if (png_ptr->transformations & PNG_PACKSWAP)
145 {
146 s_start = 0;
147 s_end = 7;
148 s_inc = 1;
149 }
150 else
151 #endif
152 {
153 s_start = 7;
154 s_end = 0;
155 s_inc = -1;
156 }
158 shift = s_start;
160 for (i = 0; i < png_ptr->width; i++)
161 {
162 if (m & mask)
163 {
164 int value;
166 value = (*sp >> shift) & 0x1;
167 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
168 *dp |= (png_byte)(value << shift);
169 }
171 if (shift == s_end)
172 {
173 shift = s_start;
174 sp++;
175 dp++;
176 }
177 else
178 shift += s_inc;
180 if (m == 1)
181 m = 0x80;
182 else
183 m >>= 1;
184 }
185 break;
186 }
188 case 2:
189 {
190 png_bytep sp;
191 png_bytep dp;
192 int s_start, s_end, s_inc;
193 int m;
194 int shift;
195 png_uint_32 i;
196 int value;
198 sp = png_ptr->row_buf + 1;
199 dp = row;
200 m = 0x80;
201 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
202 if (png_ptr->transformations & PNG_PACKSWAP)
203 {
204 s_start = 0;
205 s_end = 6;
206 s_inc = 2;
207 }
208 else
209 #endif
210 {
211 s_start = 6;
212 s_end = 0;
213 s_inc = -2;
214 }
216 shift = s_start;
218 for (i = 0; i < png_ptr->width; i++)
219 {
220 if (m & mask)
221 {
222 value = (*sp >> shift) & 0x3;
223 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
224 *dp |= (png_byte)(value << shift);
225 }
227 if (shift == s_end)
228 {
229 shift = s_start;
230 sp++;
231 dp++;
232 }
233 else
234 shift += s_inc;
235 if (m == 1)
236 m = 0x80;
237 else
238 m >>= 1;
239 }
240 break;
241 }
243 case 4:
244 {
245 png_bytep sp;
246 png_bytep dp;
247 int s_start, s_end, s_inc;
248 int m;
249 int shift;
250 png_uint_32 i;
251 int value;
253 sp = png_ptr->row_buf + 1;
254 dp = row;
255 m = 0x80;
256 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
257 if (png_ptr->transformations & PNG_PACKSWAP)
258 {
259 s_start = 0;
260 s_end = 4;
261 s_inc = 4;
262 }
263 else
264 #endif
265 {
266 s_start = 4;
267 s_end = 0;
268 s_inc = -4;
269 }
270 shift = s_start;
272 for (i = 0; i < png_ptr->width; i++)
273 {
274 if (m & mask)
275 {
276 value = (*sp >> shift) & 0xf;
277 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
278 *dp |= (png_byte)(value << shift);
279 }
281 if (shift == s_end)
282 {
283 shift = s_start;
284 sp++;
285 dp++;
286 }
287 else
288 shift += s_inc;
289 if (m == 1)
290 m = 0x80;
291 else
292 m >>= 1;
293 }
294 break;
295 }
297 case 8:
298 {
299 png_bytep srcptr;
300 png_bytep dstptr;
301 png_uint_32 len;
302 int m;
303 int diff, unmask;
305 __int64 mask0=0x0102040810204080;
307 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
308 /* && mmx_supported */ )
309 {
310 srcptr = png_ptr->row_buf + 1;
311 dstptr = row;
312 m = 0x80;
313 unmask = ~mask;
314 len = png_ptr->width &~7; //reduce to multiple of 8
315 diff = png_ptr->width & 7; //amount lost
317 _asm
318 {
319 movd mm7, unmask //load bit pattern
320 psubb mm6,mm6 //zero mm6
321 punpcklbw mm7,mm7
322 punpcklwd mm7,mm7
323 punpckldq mm7,mm7 //fill register with 8 masks
325 movq mm0,mask0
327 pand mm0,mm7 //nonzero if keep byte
328 pcmpeqb mm0,mm6 //zeros->1s, v versa
330 mov ecx,len //load length of line (pixels)
331 mov esi,srcptr //load source
332 mov ebx,dstptr //load dest
333 cmp ecx,0 //lcr
334 je mainloop8end
336 mainloop8:
337 movq mm4,[esi]
338 pand mm4,mm0
339 movq mm6,mm0
340 pandn mm6,[ebx]
341 por mm4,mm6
342 movq [ebx],mm4
344 add esi,8 //inc by 8 bytes processed
345 add ebx,8
346 sub ecx,8 //dec by 8 pixels processed
348 ja mainloop8
349 mainloop8end:
351 mov ecx,diff
352 cmp ecx,0
353 jz end8
355 mov edx,mask
356 sal edx,24 //make low byte the high byte
358 secondloop8:
359 sal edx,1 //move high bit to CF
360 jnc skip8 //if CF = 0
361 mov al,[esi]
362 mov [ebx],al
363 skip8:
364 inc esi
365 inc ebx
367 dec ecx
368 jnz secondloop8
369 end8:
370 emms
371 }
372 }
373 else /* mmx not supported - use modified C routine */
374 {
375 register unsigned int incr1, initial_val, final_val;
376 png_size_t pixel_bytes;
377 png_uint_32 i;
378 register int disp = png_pass_inc[png_ptr->pass];
379 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
381 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
382 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
383 pixel_bytes;
384 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
385 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
386 final_val = png_ptr->width*pixel_bytes;
387 incr1 = (disp)*pixel_bytes;
388 for (i = initial_val; i < final_val; i += incr1)
389 {
390 png_memcpy(dstptr, srcptr, pixel_bytes);
391 srcptr += incr1;
392 dstptr += incr1;
393 }
394 } /* end of else */
396 break;
397 } // end 8 bpp
399 case 16:
400 {
401 png_bytep srcptr;
402 png_bytep dstptr;
403 png_uint_32 len;
404 int unmask, diff;
405 __int64 mask1=0x0101020204040808,
406 mask0=0x1010202040408080;
408 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
409 /* && mmx_supported */ )
410 {
411 srcptr = png_ptr->row_buf + 1;
412 dstptr = row;
414 unmask = ~mask;
415 len = (png_ptr->width)&~7;
416 diff = (png_ptr->width)&7;
417 _asm
418 {
419 movd mm7, unmask //load bit pattern
420 psubb mm6,mm6 //zero mm6
421 punpcklbw mm7,mm7
422 punpcklwd mm7,mm7
423 punpckldq mm7,mm7 //fill register with 8 masks
425 movq mm0,mask0
426 movq mm1,mask1
428 pand mm0,mm7
429 pand mm1,mm7
431 pcmpeqb mm0,mm6
432 pcmpeqb mm1,mm6
434 mov ecx,len //load length of line
435 mov esi,srcptr //load source
436 mov ebx,dstptr //load dest
437 cmp ecx,0 //lcr
438 jz mainloop16end
440 mainloop16:
441 movq mm4,[esi]
442 pand mm4,mm0
443 movq mm6,mm0
444 movq mm7,[ebx]
445 pandn mm6,mm7
446 por mm4,mm6
447 movq [ebx],mm4
449 movq mm5,[esi+8]
450 pand mm5,mm1
451 movq mm7,mm1
452 movq mm6,[ebx+8]
453 pandn mm7,mm6
454 por mm5,mm7
455 movq [ebx+8],mm5
457 add esi,16 //inc by 16 bytes processed
458 add ebx,16
459 sub ecx,8 //dec by 8 pixels processed
461 ja mainloop16
463 mainloop16end:
464 mov ecx,diff
465 cmp ecx,0
466 jz end16
468 mov edx,mask
469 sal edx,24 //make low byte the high byte
470 secondloop16:
471 sal edx,1 //move high bit to CF
472 jnc skip16 //if CF = 0
473 mov ax,[esi]
474 mov [ebx],ax
475 skip16:
476 add esi,2
477 add ebx,2
479 dec ecx
480 jnz secondloop16
481 end16:
482 emms
483 }
484 }
485 else /* mmx not supported - use modified C routine */
486 {
487 register unsigned int incr1, initial_val, final_val;
488 png_size_t pixel_bytes;
489 png_uint_32 i;
490 register int disp = png_pass_inc[png_ptr->pass];
491 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
493 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
494 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
495 pixel_bytes;
496 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
497 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
498 final_val = png_ptr->width*pixel_bytes;
499 incr1 = (disp)*pixel_bytes;
500 for (i = initial_val; i < final_val; i += incr1)
501 {
502 png_memcpy(dstptr, srcptr, pixel_bytes);
503 srcptr += incr1;
504 dstptr += incr1;
505 }
506 } /* end of else */
508 break;
509 } // end 16 bpp
511 case 24:
512 {
513 png_bytep srcptr;
514 png_bytep dstptr;
515 png_uint_32 len;
516 int unmask, diff;
518 __int64 mask2=0x0101010202020404, //24bpp
519 mask1=0x0408080810101020,
520 mask0=0x2020404040808080;
522 srcptr = png_ptr->row_buf + 1;
523 dstptr = row;
525 unmask = ~mask;
526 len = (png_ptr->width)&~7;
527 diff = (png_ptr->width)&7;
529 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
530 /* && mmx_supported */ )
531 {
532 _asm
533 {
534 movd mm7, unmask //load bit pattern
535 psubb mm6,mm6 //zero mm6
536 punpcklbw mm7,mm7
537 punpcklwd mm7,mm7
538 punpckldq mm7,mm7 //fill register with 8 masks
540 movq mm0,mask0
541 movq mm1,mask1
542 movq mm2,mask2
544 pand mm0,mm7
545 pand mm1,mm7
546 pand mm2,mm7
548 pcmpeqb mm0,mm6
549 pcmpeqb mm1,mm6
550 pcmpeqb mm2,mm6
552 mov ecx,len //load length of line
553 mov esi,srcptr //load source
554 mov ebx,dstptr //load dest
555 cmp ecx,0
556 jz mainloop24end
558 mainloop24:
559 movq mm4,[esi]
560 pand mm4,mm0
561 movq mm6,mm0
562 movq mm7,[ebx]
563 pandn mm6,mm7
564 por mm4,mm6
565 movq [ebx],mm4
568 movq mm5,[esi+8]
569 pand mm5,mm1
570 movq mm7,mm1
571 movq mm6,[ebx+8]
572 pandn mm7,mm6
573 por mm5,mm7
574 movq [ebx+8],mm5
576 movq mm6,[esi+16]
577 pand mm6,mm2
578 movq mm4,mm2
579 movq mm7,[ebx+16]
580 pandn mm4,mm7
581 por mm6,mm4
582 movq [ebx+16],mm6
584 add esi,24 //inc by 24 bytes processed
585 add ebx,24
586 sub ecx,8 //dec by 8 pixels processed
588 ja mainloop24
590 mainloop24end:
591 mov ecx,diff
592 cmp ecx,0
593 jz end24
595 mov edx,mask
596 sal edx,24 //make low byte the high byte
597 secondloop24:
598 sal edx,1 //move high bit to CF
599 jnc skip24 //if CF = 0
600 mov ax,[esi]
601 mov [ebx],ax
602 xor eax,eax
603 mov al,[esi+2]
604 mov [ebx+2],al
605 skip24:
606 add esi,3
607 add ebx,3
609 dec ecx
610 jnz secondloop24
612 end24:
613 emms
614 }
615 }
616 else /* mmx not supported - use modified C routine */
617 {
618 register unsigned int incr1, initial_val, final_val;
619 png_size_t pixel_bytes;
620 png_uint_32 i;
621 register int disp = png_pass_inc[png_ptr->pass];
622 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
624 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
625 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
626 pixel_bytes;
627 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
628 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
629 final_val = png_ptr->width*pixel_bytes;
630 incr1 = (disp)*pixel_bytes;
631 for (i = initial_val; i < final_val; i += incr1)
632 {
633 png_memcpy(dstptr, srcptr, pixel_bytes);
634 srcptr += incr1;
635 dstptr += incr1;
636 }
637 } /* end of else */
639 break;
640 } // end 24 bpp
642 case 32:
643 {
644 png_bytep srcptr;
645 png_bytep dstptr;
646 png_uint_32 len;
647 int unmask, diff;
649 __int64 mask3=0x0101010102020202, //32bpp
650 mask2=0x0404040408080808,
651 mask1=0x1010101020202020,
652 mask0=0x4040404080808080;
654 srcptr = png_ptr->row_buf + 1;
655 dstptr = row;
657 unmask = ~mask;
658 len = (png_ptr->width)&~7;
659 diff = (png_ptr->width)&7;
661 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
662 /* && mmx_supported */ )
663 {
664 _asm
665 {
666 movd mm7, unmask //load bit pattern
667 psubb mm6,mm6 //zero mm6
668 punpcklbw mm7,mm7
669 punpcklwd mm7,mm7
670 punpckldq mm7,mm7 //fill register with 8 masks
672 movq mm0,mask0
673 movq mm1,mask1
674 movq mm2,mask2
675 movq mm3,mask3
677 pand mm0,mm7
678 pand mm1,mm7
679 pand mm2,mm7
680 pand mm3,mm7
682 pcmpeqb mm0,mm6
683 pcmpeqb mm1,mm6
684 pcmpeqb mm2,mm6
685 pcmpeqb mm3,mm6
687 mov ecx,len //load length of line
688 mov esi,srcptr //load source
689 mov ebx,dstptr //load dest
691 cmp ecx,0 //lcr
692 jz mainloop32end
694 mainloop32:
695 movq mm4,[esi]
696 pand mm4,mm0
697 movq mm6,mm0
698 movq mm7,[ebx]
699 pandn mm6,mm7
700 por mm4,mm6
701 movq [ebx],mm4
703 movq mm5,[esi+8]
704 pand mm5,mm1
705 movq mm7,mm1
706 movq mm6,[ebx+8]
707 pandn mm7,mm6
708 por mm5,mm7
709 movq [ebx+8],mm5
711 movq mm6,[esi+16]
712 pand mm6,mm2
713 movq mm4,mm2
714 movq mm7,[ebx+16]
715 pandn mm4,mm7
716 por mm6,mm4
717 movq [ebx+16],mm6
719 movq mm7,[esi+24]
720 pand mm7,mm3
721 movq mm5,mm3
722 movq mm4,[ebx+24]
723 pandn mm5,mm4
724 por mm7,mm5
725 movq [ebx+24],mm7
727 add esi,32 //inc by 32 bytes processed
728 add ebx,32
729 sub ecx,8 //dec by 8 pixels processed
731 ja mainloop32
733 mainloop32end:
734 mov ecx,diff
735 cmp ecx,0
736 jz end32
738 mov edx,mask
739 sal edx,24 //make low byte the high byte
740 secondloop32:
741 sal edx,1 //move high bit to CF
742 jnc skip32 //if CF = 0
743 mov eax,[esi]
744 mov [ebx],eax
745 skip32:
746 add esi,4
747 add ebx,4
749 dec ecx
750 jnz secondloop32
752 end32:
753 emms
754 }
755 }
756 else /* mmx _not supported - Use modified C routine */
757 {
758 register unsigned int incr1, initial_val, final_val;
759 png_size_t pixel_bytes;
760 png_uint_32 i;
761 register int disp = png_pass_inc[png_ptr->pass];
762 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
764 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
765 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
766 pixel_bytes;
767 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
768 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
769 final_val = png_ptr->width*pixel_bytes;
770 incr1 = (disp)*pixel_bytes;
771 for (i = initial_val; i < final_val; i += incr1)
772 {
773 png_memcpy(dstptr, srcptr, pixel_bytes);
774 srcptr += incr1;
775 dstptr += incr1;
776 }
777 } /* end of else */
779 break;
780 } // end 32 bpp
782 case 48:
783 {
784 png_bytep srcptr;
785 png_bytep dstptr;
786 png_uint_32 len;
787 int unmask, diff;
789 __int64 mask5=0x0101010101010202,
790 mask4=0x0202020204040404,
791 mask3=0x0404080808080808,
792 mask2=0x1010101010102020,
793 mask1=0x2020202040404040,
794 mask0=0x4040808080808080;
796 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
797 /* && mmx_supported */ )
798 {
799 srcptr = png_ptr->row_buf + 1;
800 dstptr = row;
802 unmask = ~mask;
803 len = (png_ptr->width)&~7;
804 diff = (png_ptr->width)&7;
805 _asm
806 {
807 movd mm7, unmask //load bit pattern
808 psubb mm6,mm6 //zero mm6
809 punpcklbw mm7,mm7
810 punpcklwd mm7,mm7
811 punpckldq mm7,mm7 //fill register with 8 masks
813 movq mm0,mask0
814 movq mm1,mask1
815 movq mm2,mask2
816 movq mm3,mask3
817 movq mm4,mask4
818 movq mm5,mask5
820 pand mm0,mm7
821 pand mm1,mm7
822 pand mm2,mm7
823 pand mm3,mm7
824 pand mm4,mm7
825 pand mm5,mm7
827 pcmpeqb mm0,mm6
828 pcmpeqb mm1,mm6
829 pcmpeqb mm2,mm6
830 pcmpeqb mm3,mm6
831 pcmpeqb mm4,mm6
832 pcmpeqb mm5,mm6
834 mov ecx,len //load length of line
835 mov esi,srcptr //load source
836 mov ebx,dstptr //load dest
838 cmp ecx,0
839 jz mainloop48end
841 mainloop48:
842 movq mm7,[esi]
843 pand mm7,mm0
844 movq mm6,mm0
845 pandn mm6,[ebx]
846 por mm7,mm6
847 movq [ebx],mm7
849 movq mm6,[esi+8]
850 pand mm6,mm1
851 movq mm7,mm1
852 pandn mm7,[ebx+8]
853 por mm6,mm7
854 movq [ebx+8],mm6
856 movq mm6,[esi+16]
857 pand mm6,mm2
858 movq mm7,mm2
859 pandn mm7,[ebx+16]
860 por mm6,mm7
861 movq [ebx+16],mm6
863 movq mm7,[esi+24]
864 pand mm7,mm3
865 movq mm6,mm3
866 pandn mm6,[ebx+24]
867 por mm7,mm6
868 movq [ebx+24],mm7
870 movq mm6,[esi+32]
871 pand mm6,mm4
872 movq mm7,mm4
873 pandn mm7,[ebx+32]
874 por mm6,mm7
875 movq [ebx+32],mm6
877 movq mm7,[esi+40]
878 pand mm7,mm5
879 movq mm6,mm5
880 pandn mm6,[ebx+40]
881 por mm7,mm6
882 movq [ebx+40],mm7
884 add esi,48 //inc by 32 bytes processed
885 add ebx,48
886 sub ecx,8 //dec by 8 pixels processed
888 ja mainloop48
889 mainloop48end:
891 mov ecx,diff
892 cmp ecx,0
893 jz end48
895 mov edx,mask
896 sal edx,24 //make low byte the high byte
898 secondloop48:
899 sal edx,1 //move high bit to CF
900 jnc skip48 //if CF = 0
901 mov eax,[esi]
902 mov [ebx],eax
903 skip48:
904 add esi,4
905 add ebx,4
907 dec ecx
908 jnz secondloop48
910 end48:
911 emms
912 }
913 }
914 else /* mmx _not supported - Use modified C routine */
915 {
916 register unsigned int incr1, initial_val, final_val;
917 png_size_t pixel_bytes;
918 png_uint_32 i;
919 register int disp = png_pass_inc[png_ptr->pass];
920 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
922 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
923 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
924 pixel_bytes;
925 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
926 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
927 final_val = png_ptr->width*pixel_bytes;
928 incr1 = (disp)*pixel_bytes;
929 for (i = initial_val; i < final_val; i += incr1)
930 {
931 png_memcpy(dstptr, srcptr, pixel_bytes);
932 srcptr += incr1;
933 dstptr += incr1;
934 }
935 } /* end of else */
937 break;
938 } // end 48 bpp
940 default:
941 {
942 png_bytep sptr;
943 png_bytep dp;
944 png_size_t pixel_bytes;
945 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
946 unsigned int i;
947 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
948 register unsigned int incr1, initial_val, final_val;
950 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
951 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
952 pixel_bytes;
953 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
954 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
955 final_val = png_ptr->width*pixel_bytes;
956 incr1 = (disp)*pixel_bytes;
957 for (i = initial_val; i < final_val; i += incr1)
958 {
959 png_memcpy(dp, sptr, pixel_bytes);
960 sptr += incr1;
961 dp += incr1;
962 }
963 break;
964 }
965 } /* end switch (png_ptr->row_info.pixel_depth) */
966 } /* end if (non-trivial mask) */
968 } /* end png_combine_row() */
971 #if defined(PNG_READ_INTERLACING_SUPPORTED)
973 void /* PRIVATE */
974 png_do_read_interlace(png_structp png_ptr)
975 {
976 png_row_infop row_info = &(png_ptr->row_info);
977 png_bytep row = png_ptr->row_buf + 1;
978 int pass = png_ptr->pass;
979 png_uint_32 transformations = png_ptr->transformations;
980 #ifdef PNG_USE_LOCAL_ARRAYS
981 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
982 #endif
984 png_debug(1,"in png_do_read_interlace\n");
986 if (mmx_supported == 2) {
987 /* this should have happened in png_init_mmx_flags() already */
988 png_warning(png_ptr, "asm_flags may not have been initialized");
989 png_mmx_support();
990 }
992 if (row != NULL && row_info != NULL)
993 {
994 png_uint_32 final_width;
996 final_width = row_info->width * png_pass_inc[pass];
998 switch (row_info->pixel_depth)
999 {
1000 case 1:
1001 {
1002 png_bytep sp, dp;
1003 int sshift, dshift;
1004 int s_start, s_end, s_inc;
1005 png_byte v;
1006 png_uint_32 i;
1007 int j;
1009 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1010 dp = row + (png_size_t)((final_width - 1) >> 3);
1011 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1012 if (transformations & PNG_PACKSWAP)
1013 {
1014 sshift = (int)((row_info->width + 7) & 7);
1015 dshift = (int)((final_width + 7) & 7);
1016 s_start = 7;
1017 s_end = 0;
1018 s_inc = -1;
1019 }
1020 else
1021 #endif
1022 {
1023 sshift = 7 - (int)((row_info->width + 7) & 7);
1024 dshift = 7 - (int)((final_width + 7) & 7);
1025 s_start = 0;
1026 s_end = 7;
1027 s_inc = 1;
1028 }
1030 for (i = row_info->width; i; i--)
1031 {
1032 v = (png_byte)((*sp >> sshift) & 0x1);
1033 for (j = 0; j < png_pass_inc[pass]; j++)
1034 {
1035 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1036 *dp |= (png_byte)(v << dshift);
1037 if (dshift == s_end)
1038 {
1039 dshift = s_start;
1040 dp--;
1041 }
1042 else
1043 dshift += s_inc;
1044 }
1045 if (sshift == s_end)
1046 {
1047 sshift = s_start;
1048 sp--;
1049 }
1050 else
1051 sshift += s_inc;
1052 }
1053 break;
1054 }
1056 case 2:
1057 {
1058 png_bytep sp, dp;
1059 int sshift, dshift;
1060 int s_start, s_end, s_inc;
1061 png_uint_32 i;
1063 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1064 dp = row + (png_size_t)((final_width - 1) >> 2);
1065 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1066 if (transformations & PNG_PACKSWAP)
1067 {
1068 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1069 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1070 s_start = 6;
1071 s_end = 0;
1072 s_inc = -2;
1073 }
1074 else
1075 #endif
1076 {
1077 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1078 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1079 s_start = 0;
1080 s_end = 6;
1081 s_inc = 2;
1082 }
1084 for (i = row_info->width; i; i--)
1085 {
1086 png_byte v;
1087 int j;
1089 v = (png_byte)((*sp >> sshift) & 0x3);
1090 for (j = 0; j < png_pass_inc[pass]; j++)
1091 {
1092 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1093 *dp |= (png_byte)(v << dshift);
1094 if (dshift == s_end)
1095 {
1096 dshift = s_start;
1097 dp--;
1098 }
1099 else
1100 dshift += s_inc;
1101 }
1102 if (sshift == s_end)
1103 {
1104 sshift = s_start;
1105 sp--;
1106 }
1107 else
1108 sshift += s_inc;
1109 }
1110 break;
1111 }
1113 case 4:
1114 {
1115 png_bytep sp, dp;
1116 int sshift, dshift;
1117 int s_start, s_end, s_inc;
1118 png_uint_32 i;
1120 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1121 dp = row + (png_size_t)((final_width - 1) >> 1);
1122 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1123 if (transformations & PNG_PACKSWAP)
1124 {
1125 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1126 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1127 s_start = 4;
1128 s_end = 0;
1129 s_inc = -4;
1130 }
1131 else
1132 #endif
1133 {
1134 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1135 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1136 s_start = 0;
1137 s_end = 4;
1138 s_inc = 4;
1139 }
1141 for (i = row_info->width; i; i--)
1142 {
1143 png_byte v;
1144 int j;
1146 v = (png_byte)((*sp >> sshift) & 0xf);
1147 for (j = 0; j < png_pass_inc[pass]; j++)
1148 {
1149 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1150 *dp |= (png_byte)(v << dshift);
1151 if (dshift == s_end)
1152 {
1153 dshift = s_start;
1154 dp--;
1155 }
1156 else
1157 dshift += s_inc;
1158 }
1159 if (sshift == s_end)
1160 {
1161 sshift = s_start;
1162 sp--;
1163 }
1164 else
1165 sshift += s_inc;
1166 }
1167 break;
1168 }
1170 default: // This is the place where the routine is modified
1171 {
1172 __int64 const4 = 0x0000000000FFFFFF;
1173 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1174 __int64 const6 = 0x00000000000000FF;
1175 png_bytep sptr, dp;
1176 png_uint_32 i;
1177 png_size_t pixel_bytes;
1178 int width = row_info->width;
1180 pixel_bytes = (row_info->pixel_depth >> 3);
1182 sptr = row + (width - 1) * pixel_bytes;
1183 dp = row + (final_width - 1) * pixel_bytes;
1184 // New code by Nirav Chhatrapati - Intel Corporation
1185 // sign fix by GRR
1186 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1188 // use MMX routine if machine supports it
1189 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1190 /* && mmx_supported */ )
1191 {
1192 if (pixel_bytes == 3)
1193 {
1194 if (((pass == 0) || (pass == 1)) && width)
1195 {
1196 _asm
1197 {
1198 mov esi, sptr
1199 mov edi, dp
1200 mov ecx, width
1201 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1202 loop_pass0:
1203 movd mm0, [esi] ; X X X X X v2 v1 v0
1204 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1205 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1206 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1207 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1208 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1209 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1210 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1211 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1212 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1213 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1214 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1215 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1216 movq [edi+16] , mm4
1217 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1218 movq [edi+8] , mm3
1219 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1220 sub esi, 3
1221 movq [edi], mm0
1222 sub edi, 24
1223 //sub esi, 3
1224 dec ecx
1225 jnz loop_pass0
1226 EMMS
1227 }
1228 }
1229 else if (((pass == 2) || (pass == 3)) && width)
1230 {
1231 _asm
1232 {
1233 mov esi, sptr
1234 mov edi, dp
1235 mov ecx, width
1236 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1237 loop_pass2:
1238 movd mm0, [esi] ; X X X X X v2 v1 v0
1239 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1240 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1241 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1242 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1243 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1244 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1245 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1246 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1247 movq [edi+4], mm0 ; move to memory
1248 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1249 movd [edi], mm0 ; move to memory
1250 sub esi, 3
1251 sub edi, 12
1252 dec ecx
1253 jnz loop_pass2
1254 EMMS
1255 }
1256 }
1257 else if (width) /* && ((pass == 4) || (pass == 5)) */
1258 {
1259 int width_mmx = ((width >> 1) << 1) - 8;
1260 if (width_mmx < 0)
1261 width_mmx = 0;
1262 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1263 if (width_mmx)
1264 {
1265 _asm
1266 {
1267 mov esi, sptr
1268 mov edi, dp
1269 mov ecx, width_mmx
1270 sub esi, 3
1271 sub edi, 9
1272 loop_pass4:
1273 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1274 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1275 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1276 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1277 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1278 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1279 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1280 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1281 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1282 movq [edi], mm0 ; move quad to memory
1283 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1284 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1285 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1286 movd [edi+8], mm6 ; move double to memory
1287 sub esi, 6
1288 sub edi, 12
1289 sub ecx, 2
1290 jnz loop_pass4
1291 EMMS
1292 }
1293 }
1295 sptr -= width_mmx*3;
1296 dp -= width_mmx*6;
1297 for (i = width; i; i--)
1298 {
1299 png_byte v[8];
1300 int j;
1302 png_memcpy(v, sptr, 3);
1303 for (j = 0; j < png_pass_inc[pass]; j++)
1304 {
1305 png_memcpy(dp, v, 3);
1306 dp -= 3;
1307 }
1308 sptr -= 3;
1309 }
1310 }
1311 } /* end of pixel_bytes == 3 */
1313 else if (pixel_bytes == 1)
1314 {
1315 if (((pass == 0) || (pass == 1)) && width)
1316 {
1317 int width_mmx = ((width >> 2) << 2);
1318 width -= width_mmx;
1319 if (width_mmx)
1320 {
1321 _asm
1322 {
1323 mov esi, sptr
1324 mov edi, dp
1325 mov ecx, width_mmx
1326 sub edi, 31
1327 sub esi, 3
1328 loop1_pass0:
1329 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1330 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1331 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1332 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1333 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1334 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1335 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1336 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1337 movq [edi], mm0 ; move to memory v3
1338 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1339 movq [edi+8], mm3 ; move to memory v2
1340 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1341 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1342 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1343 movq [edi+16], mm2 ; move to memory v1
1344 movq [edi+24], mm4 ; move to memory v0
1345 sub esi, 4
1346 sub edi, 32
1347 sub ecx, 4
1348 jnz loop1_pass0
1349 EMMS
1350 }
1351 }
1353 sptr -= width_mmx;
1354 dp -= width_mmx*8;
1355 for (i = width; i; i--)
1356 {
1357 int j;
1359 /* I simplified this part in version 1.0.4e
1360 * here and in several other instances where
1361 * pixel_bytes == 1 -- GR-P
1362 *
1363 * Original code:
1364 *
1365 * png_byte v[8];
1366 * png_memcpy(v, sptr, pixel_bytes);
1367 * for (j = 0; j < png_pass_inc[pass]; j++)
1368 * {
1369 * png_memcpy(dp, v, pixel_bytes);
1370 * dp -= pixel_bytes;
1371 * }
1372 * sptr -= pixel_bytes;
1373 *
1374 * Replacement code is in the next three lines:
1375 */
1377 for (j = 0; j < png_pass_inc[pass]; j++)
1378 *dp-- = *sptr;
1379 sptr--;
1380 }
1381 }
1382 else if (((pass == 2) || (pass == 3)) && width)
1383 {
1384 int width_mmx = ((width >> 2) << 2);
1385 width -= width_mmx;
1386 if (width_mmx)
1387 {
1388 _asm
1389 {
1390 mov esi, sptr
1391 mov edi, dp
1392 mov ecx, width_mmx
1393 sub edi, 15
1394 sub esi, 3
1395 loop1_pass2:
1396 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1397 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1398 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1399 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1400 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1401 movq [edi], mm0 ; move to memory v2 and v3
1402 sub esi, 4
1403 movq [edi+8], mm1 ; move to memory v1 and v0
1404 sub edi, 16
1405 sub ecx, 4
1406 jnz loop1_pass2
1407 EMMS
1408 }
1409 }
1411 sptr -= width_mmx;
1412 dp -= width_mmx*4;
1413 for (i = width; i; i--)
1414 {
1415 int j;
1417 for (j = 0; j < png_pass_inc[pass]; j++)
1418 {
1419 *dp-- = *sptr;
1420 }
1421 sptr --;
1422 }
1423 }
1424 else if (width) /* && ((pass == 4) || (pass == 5))) */
1425 {
1426 int width_mmx = ((width >> 3) << 3);
1427 width -= width_mmx;
1428 if (width_mmx)
1429 {
1430 _asm
1431 {
1432 mov esi, sptr
1433 mov edi, dp
1434 mov ecx, width_mmx
1435 sub edi, 15
1436 sub esi, 7
1437 loop1_pass4:
1438 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1439 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1440 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1441 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1442 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1443 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1444 sub esi, 8
1445 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1446 //sub esi, 4
1447 sub edi, 16
1448 sub ecx, 8
1449 jnz loop1_pass4
1450 EMMS
1451 }
1452 }
1454 sptr -= width_mmx;
1455 dp -= width_mmx*2;
1456 for (i = width; i; i--)
1457 {
1458 int j;
1460 for (j = 0; j < png_pass_inc[pass]; j++)
1461 {
1462 *dp-- = *sptr;
1463 }
1464 sptr --;
1465 }
1466 }
1467 } /* end of pixel_bytes == 1 */
1469 else if (pixel_bytes == 2)
1470 {
1471 if (((pass == 0) || (pass == 1)) && width)
1472 {
1473 int width_mmx = ((width >> 1) << 1);
1474 width -= width_mmx;
1475 if (width_mmx)
1476 {
1477 _asm
1478 {
1479 mov esi, sptr
1480 mov edi, dp
1481 mov ecx, width_mmx
1482 sub esi, 2
1483 sub edi, 30
1484 loop2_pass0:
1485 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1486 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1487 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1488 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1489 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1490 movq [edi], mm0
1491 movq [edi + 8], mm0
1492 movq [edi + 16], mm1
1493 movq [edi + 24], mm1
1494 sub esi, 4
1495 sub edi, 32
1496 sub ecx, 2
1497 jnz loop2_pass0
1498 EMMS
1499 }
1500 }
1502 sptr -= (width_mmx*2 - 2); // sign fixed
1503 dp -= (width_mmx*16 - 2); // sign fixed
1504 for (i = width; i; i--)
1505 {
1506 png_byte v[8];
1507 int j;
1508 sptr -= 2;
1509 png_memcpy(v, sptr, 2);
1510 for (j = 0; j < png_pass_inc[pass]; j++)
1511 {
1512 dp -= 2;
1513 png_memcpy(dp, v, 2);
1514 }
1515 }
1516 }
1517 else if (((pass == 2) || (pass == 3)) && width)
1518 {
1519 int width_mmx = ((width >> 1) << 1) ;
1520 width -= width_mmx;
1521 if (width_mmx)
1522 {
1523 _asm
1524 {
1525 mov esi, sptr
1526 mov edi, dp
1527 mov ecx, width_mmx
1528 sub esi, 2
1529 sub edi, 14
1530 loop2_pass2:
1531 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1532 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1533 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1534 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1535 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1536 movq [edi], mm0
1537 sub esi, 4
1538 movq [edi + 8], mm1
1539 //sub esi, 4
1540 sub edi, 16
1541 sub ecx, 2
1542 jnz loop2_pass2
1543 EMMS
1544 }
1545 }
1547 sptr -= (width_mmx*2 - 2); // sign fixed
1548 dp -= (width_mmx*8 - 2); // sign fixed
1549 for (i = width; i; i--)
1550 {
1551 png_byte v[8];
1552 int j;
1553 sptr -= 2;
1554 png_memcpy(v, sptr, 2);
1555 for (j = 0; j < png_pass_inc[pass]; j++)
1556 {
1557 dp -= 2;
1558 png_memcpy(dp, v, 2);
1559 }
1560 }
1561 }
1562 else if (width) // pass == 4 or 5
1563 {
1564 int width_mmx = ((width >> 1) << 1) ;
1565 width -= width_mmx;
1566 if (width_mmx)
1567 {
1568 _asm
1569 {
1570 mov esi, sptr
1571 mov edi, dp
1572 mov ecx, width_mmx
1573 sub esi, 2
1574 sub edi, 6
1575 loop2_pass4:
1576 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1577 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1578 sub esi, 4
1579 movq [edi], mm0
1580 sub edi, 8
1581 sub ecx, 2
1582 jnz loop2_pass4
1583 EMMS
1584 }
1585 }
1587 sptr -= (width_mmx*2 - 2); // sign fixed
1588 dp -= (width_mmx*4 - 2); // sign fixed
1589 for (i = width; i; i--)
1590 {
1591 png_byte v[8];
1592 int j;
1593 sptr -= 2;
1594 png_memcpy(v, sptr, 2);
1595 for (j = 0; j < png_pass_inc[pass]; j++)
1596 {
1597 dp -= 2;
1598 png_memcpy(dp, v, 2);
1599 }
1600 }
1601 }
1602 } /* end of pixel_bytes == 2 */
1604 else if (pixel_bytes == 4)
1605 {
1606 if (((pass == 0) || (pass == 1)) && width)
1607 {
1608 int width_mmx = ((width >> 1) << 1) ;
1609 width -= width_mmx;
1610 if (width_mmx)
1611 {
1612 _asm
1613 {
1614 mov esi, sptr
1615 mov edi, dp
1616 mov ecx, width_mmx
1617 sub esi, 4
1618 sub edi, 60
1619 loop4_pass0:
1620 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1621 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1622 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1623 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1624 movq [edi], mm0
1625 movq [edi + 8], mm0
1626 movq [edi + 16], mm0
1627 movq [edi + 24], mm0
1628 movq [edi+32], mm1
1629 movq [edi + 40], mm1
1630 movq [edi+ 48], mm1
1631 sub esi, 8
1632 movq [edi + 56], mm1
1633 sub edi, 64
1634 sub ecx, 2
1635 jnz loop4_pass0
1636 EMMS
1637 }
1638 }
1640 sptr -= (width_mmx*4 - 4); // sign fixed
1641 dp -= (width_mmx*32 - 4); // sign fixed
1642 for (i = width; i; i--)
1643 {
1644 png_byte v[8];
1645 int j;
1646 sptr -= 4;
1647 png_memcpy(v, sptr, 4);
1648 for (j = 0; j < png_pass_inc[pass]; j++)
1649 {
1650 dp -= 4;
1651 png_memcpy(dp, v, 4);
1652 }
1653 }
1654 }
1655 else if (((pass == 2) || (pass == 3)) && width)
1656 {
1657 int width_mmx = ((width >> 1) << 1) ;
1658 width -= width_mmx;
1659 if (width_mmx)
1660 {
1661 _asm
1662 {
1663 mov esi, sptr
1664 mov edi, dp
1665 mov ecx, width_mmx
1666 sub esi, 4
1667 sub edi, 28
1668 loop4_pass2:
1669 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1670 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1671 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1672 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1673 movq [edi], mm0
1674 movq [edi + 8], mm0
1675 movq [edi+16], mm1
1676 movq [edi + 24], mm1
1677 sub esi, 8
1678 sub edi, 32
1679 sub ecx, 2
1680 jnz loop4_pass2
1681 EMMS
1682 }
1683 }
1685 sptr -= (width_mmx*4 - 4); // sign fixed
1686 dp -= (width_mmx*16 - 4); // sign fixed
1687 for (i = width; i; i--)
1688 {
1689 png_byte v[8];
1690 int j;
1691 sptr -= 4;
1692 png_memcpy(v, sptr, 4);
1693 for (j = 0; j < png_pass_inc[pass]; j++)
1694 {
1695 dp -= 4;
1696 png_memcpy(dp, v, 4);
1697 }
1698 }
1699 }
1700 else if (width) // pass == 4 or 5
1701 {
1702 int width_mmx = ((width >> 1) << 1) ;
1703 width -= width_mmx;
1704 if (width_mmx)
1705 {
1706 _asm
1707 {
1708 mov esi, sptr
1709 mov edi, dp
1710 mov ecx, width_mmx
1711 sub esi, 4
1712 sub edi, 12
1713 loop4_pass4:
1714 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1715 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1716 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1717 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1718 movq [edi], mm0
1719 sub esi, 8
1720 movq [edi + 8], mm1
1721 sub edi, 16
1722 sub ecx, 2
1723 jnz loop4_pass4
1724 EMMS
1725 }
1726 }
1728 sptr -= (width_mmx*4 - 4); // sign fixed
1729 dp -= (width_mmx*8 - 4); // sign fixed
1730 for (i = width; i; i--)
1731 {
1732 png_byte v[8];
1733 int j;
1734 sptr -= 4;
1735 png_memcpy(v, sptr, 4);
1736 for (j = 0; j < png_pass_inc[pass]; j++)
1737 {
1738 dp -= 4;
1739 png_memcpy(dp, v, 4);
1740 }
1741 }
1742 }
1744 } /* end of pixel_bytes == 4 */
1746 else if (pixel_bytes == 6)
1747 {
1748 for (i = width; i; i--)
1749 {
1750 png_byte v[8];
1751 int j;
1752 png_memcpy(v, sptr, 6);
1753 for (j = 0; j < png_pass_inc[pass]; j++)
1754 {
1755 png_memcpy(dp, v, 6);
1756 dp -= 6;
1757 }
1758 sptr -= 6;
1759 }
1760 } /* end of pixel_bytes == 6 */
1762 else
1763 {
1764 for (i = width; i; i--)
1765 {
1766 png_byte v[8];
1767 int j;
1768 png_memcpy(v, sptr, pixel_bytes);
1769 for (j = 0; j < png_pass_inc[pass]; j++)
1770 {
1771 png_memcpy(dp, v, pixel_bytes);
1772 dp -= pixel_bytes;
1773 }
1774 sptr-= pixel_bytes;
1775 }
1776 }
1777 } /* end of mmx_supported */
1779 else /* MMX not supported: use modified C code - takes advantage
1780 * of inlining of memcpy for a constant */
1781 {
1782 if (pixel_bytes == 1)
1783 {
1784 for (i = width; i; i--)
1785 {
1786 int j;
1787 for (j = 0; j < png_pass_inc[pass]; j++)
1788 *dp-- = *sptr;
1789 sptr--;
1790 }
1791 }
1792 else if (pixel_bytes == 3)
1793 {
1794 for (i = width; i; i--)
1795 {
1796 png_byte v[8];
1797 int j;
1798 png_memcpy(v, sptr, pixel_bytes);
1799 for (j = 0; j < png_pass_inc[pass]; j++)
1800 {
1801 png_memcpy(dp, v, pixel_bytes);
1802 dp -= pixel_bytes;
1803 }
1804 sptr -= pixel_bytes;
1805 }
1806 }
1807 else if (pixel_bytes == 2)
1808 {
1809 for (i = width; i; i--)
1810 {
1811 png_byte v[8];
1812 int j;
1813 png_memcpy(v, sptr, pixel_bytes);
1814 for (j = 0; j < png_pass_inc[pass]; j++)
1815 {
1816 png_memcpy(dp, v, pixel_bytes);
1817 dp -= pixel_bytes;
1818 }
1819 sptr -= pixel_bytes;
1820 }
1821 }
1822 else if (pixel_bytes == 4)
1823 {
1824 for (i = width; i; i--)
1825 {
1826 png_byte v[8];
1827 int j;
1828 png_memcpy(v, sptr, pixel_bytes);
1829 for (j = 0; j < png_pass_inc[pass]; j++)
1830 {
1831 png_memcpy(dp, v, pixel_bytes);
1832 dp -= pixel_bytes;
1833 }
1834 sptr -= pixel_bytes;
1835 }
1836 }
1837 else if (pixel_bytes == 6)
1838 {
1839 for (i = width; i; i--)
1840 {
1841 png_byte v[8];
1842 int j;
1843 png_memcpy(v, sptr, pixel_bytes);
1844 for (j = 0; j < png_pass_inc[pass]; j++)
1845 {
1846 png_memcpy(dp, v, pixel_bytes);
1847 dp -= pixel_bytes;
1848 }
1849 sptr -= pixel_bytes;
1850 }
1851 }
1852 else
1853 {
1854 for (i = width; i; i--)
1855 {
1856 png_byte v[8];
1857 int j;
1858 png_memcpy(v, sptr, pixel_bytes);
1859 for (j = 0; j < png_pass_inc[pass]; j++)
1860 {
1861 png_memcpy(dp, v, pixel_bytes);
1862 dp -= pixel_bytes;
1863 }
1864 sptr -= pixel_bytes;
1865 }
1866 }
1868 } /* end of MMX not supported */
1869 break;
1870 }
1871 } /* end switch (row_info->pixel_depth) */
1873 row_info->width = final_width;
1874 row_info->rowbytes = ((final_width *
1875 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1876 }
1878 }
1880 #endif /* PNG_READ_INTERLACING_SUPPORTED */
// These variables are utilized in the functions below. They are declared
// globally here (rather than on the stack) to ensure alignment on 8-byte
// boundaries: the `double` union member forces 8-byte alignment, which the
// `movq` loads in the MMX code below rely on.
union uAll {
   __int64 use;     // the 64-bit mask/shift-count value actually loaded into an MMX register
   double align;    // unused at runtime; present only to force 8-byte alignment
} LBCarryMask = {0x0101010101010101},  // lsb of each byte; used to extract per-byte carry bits
  HBClearMask = {0x7f7f7f7f7f7f7f7f},  // clears bit 7 of each byte after a byte-wise psrlq-by-1
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;  // set per-bpp before each asm block
1894 // Optimized code for PNG Average filter decoder
1895 void /* PRIVATE */
1896 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1897 , png_bytep prev_row)
1898 {
1899 int bpp;
1900 png_uint_32 FullLength;
1901 png_uint_32 MMXLength;
1902 //png_uint_32 len;
1903 int diff;
1905 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1906 FullLength = row_info->rowbytes; // # of bytes to filter
1907 _asm {
1908 // Init address pointers and offset
1909 mov edi, row // edi ==> Avg(x)
1910 xor ebx, ebx // ebx ==> x
1911 mov edx, edi
1912 mov esi, prev_row // esi ==> Prior(x)
1913 sub edx, bpp // edx ==> Raw(x-bpp)
1915 xor eax, eax
1916 // Compute the Raw value for the first bpp bytes
1917 // Raw(x) = Avg(x) + (Prior(x)/2)
1918 davgrlp:
1919 mov al, [esi + ebx] // Load al with Prior(x)
1920 inc ebx
1921 shr al, 1 // divide by 2
1922 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1923 cmp ebx, bpp
1924 mov [edi+ebx-1], al // Write back Raw(x);
1925 // mov does not affect flags; -1 to offset inc ebx
1926 jb davgrlp
1927 // get # of bytes to alignment
1928 mov diff, edi // take start of row
1929 add diff, ebx // add bpp
1930 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1931 and diff, 0xfffffff8 // mask to alignment boundary
1932 sub diff, edi // subtract from start ==> value ebx at alignment
1933 jz davggo
1934 // fix alignment
1935 // Compute the Raw value for the bytes upto the alignment boundary
1936 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1937 xor ecx, ecx
1938 davglp1:
1939 xor eax, eax
1940 mov cl, [esi + ebx] // load cl with Prior(x)
1941 mov al, [edx + ebx] // load al with Raw(x-bpp)
1942 add ax, cx
1943 inc ebx
1944 shr ax, 1 // divide by 2
1945 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1946 cmp ebx, diff // Check if at alignment boundary
1947 mov [edi+ebx-1], al // Write back Raw(x);
1948 // mov does not affect flags; -1 to offset inc ebx
1949 jb davglp1 // Repeat until at alignment boundary
1950 davggo:
1951 mov eax, FullLength
1952 mov ecx, eax
1953 sub eax, ebx // subtract alignment fix
1954 and eax, 0x00000007 // calc bytes over mult of 8
1955 sub ecx, eax // drop over bytes from original length
1956 mov MMXLength, ecx
1957 } // end _asm block
1958 // Now do the math for the rest of the row
1959 switch ( bpp )
1960 {
1961 case 3:
1962 {
1963 ActiveMask.use = 0x0000000000ffffff;
1964 ShiftBpp.use = 24; // == 3 * 8
1965 ShiftRem.use = 40; // == 64 - 24
1966 _asm {
1967 // Re-init address pointers and offset
1968 movq mm7, ActiveMask
1969 mov ebx, diff // ebx ==> x = offset to alignment boundary
1970 movq mm5, LBCarryMask
1971 mov edi, row // edi ==> Avg(x)
1972 movq mm4, HBClearMask
1973 mov esi, prev_row // esi ==> Prior(x)
1974 // PRIME the pump (load the first Raw(x-bpp) data set
1975 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1976 // (we correct position in loop below)
1977 davg3lp:
1978 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1979 // Add (Prev_row/2) to Average
1980 movq mm3, mm5
1981 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1982 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1983 movq mm6, mm7
1984 pand mm3, mm1 // get lsb for each prev_row byte
1985 psrlq mm1, 1 // divide prev_row bytes by 2
1986 pand mm1, mm4 // clear invalid bit 7 of each byte
1987 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1988 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1989 movq mm1, mm3 // now use mm1 for getting LBCarrys
1990 pand mm1, mm2 // get LBCarrys for each byte where both
1991 // lsb's were == 1 (Only valid for active group)
1992 psrlq mm2, 1 // divide raw bytes by 2
1993 pand mm2, mm4 // clear invalid bit 7 of each byte
1994 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1995 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1996 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
1997 // byte
1998 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
1999 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2000 movq mm2, mm0 // mov updated Raws to mm2
2001 psllq mm2, ShiftBpp // shift data to position correctly
2002 movq mm1, mm3 // now use mm1 for getting LBCarrys
2003 pand mm1, mm2 // get LBCarrys for each byte where both
2004 // lsb's were == 1 (Only valid for active group)
2005 psrlq mm2, 1 // divide raw bytes by 2
2006 pand mm2, mm4 // clear invalid bit 7 of each byte
2007 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2008 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2009 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2010 // byte
2012 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2013 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2014 // bytes
2015 movq mm2, mm0 // mov updated Raws to mm2
2016 psllq mm2, ShiftBpp // shift data to position correctly
2017 // Data only needs to be shifted once here to
2018 // get the correct x-bpp offset.
2019 movq mm1, mm3 // now use mm1 for getting LBCarrys
2020 pand mm1, mm2 // get LBCarrys for each byte where both
2021 // lsb's were == 1 (Only valid for active group)
2022 psrlq mm2, 1 // divide raw bytes by 2
2023 pand mm2, mm4 // clear invalid bit 7 of each byte
2024 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2025 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2026 add ebx, 8
2027 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2028 // byte
2030 // Now ready to write back to memory
2031 movq [edi + ebx - 8], mm0
2032 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2033 cmp ebx, MMXLength
2034 movq mm2, mm0 // mov updated Raw(x) to mm2
2035 jb davg3lp
2036 } // end _asm block
2037 }
2038 break;
2040 case 6:
2041 case 4:
2042 case 7:
2043 case 5:
2044 {
2045 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2046 // appropriate inactive bytes
2047 ShiftBpp.use = bpp << 3;
2048 ShiftRem.use = 64 - ShiftBpp.use;
2049 _asm {
2050 movq mm4, HBClearMask
2051 // Re-init address pointers and offset
2052 mov ebx, diff // ebx ==> x = offset to alignment boundary
2053 // Load ActiveMask and clear all bytes except for 1st active group
2054 movq mm7, ActiveMask
2055 mov edi, row // edi ==> Avg(x)
2056 psrlq mm7, ShiftRem
2057 mov esi, prev_row // esi ==> Prior(x)
2058 movq mm6, mm7
2059 movq mm5, LBCarryMask
2060 psllq mm6, ShiftBpp // Create mask for 2nd active group
2061 // PRIME the pump (load the first Raw(x-bpp) data set
2062 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2063 // (we correct position in loop below)
2064 davg4lp:
2065 movq mm0, [edi + ebx]
2066 psrlq mm2, ShiftRem // shift data to position correctly
2067 movq mm1, [esi + ebx]
2068 // Add (Prev_row/2) to Average
2069 movq mm3, mm5
2070 pand mm3, mm1 // get lsb for each prev_row byte
2071 psrlq mm1, 1 // divide prev_row bytes by 2
2072 pand mm1, mm4 // clear invalid bit 7 of each byte
2073 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2074 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2075 movq mm1, mm3 // now use mm1 for getting LBCarrys
2076 pand mm1, mm2 // get LBCarrys for each byte where both
2077 // lsb's were == 1 (Only valid for active group)
2078 psrlq mm2, 1 // divide raw bytes by 2
2079 pand mm2, mm4 // clear invalid bit 7 of each byte
2080 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2081 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2082 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2083 // byte
2084 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2085 movq mm2, mm0 // mov updated Raws to mm2
2086 psllq mm2, ShiftBpp // shift data to position correctly
2087 add ebx, 8
2088 movq mm1, mm3 // now use mm1 for getting LBCarrys
2089 pand mm1, mm2 // get LBCarrys for each byte where both
2090 // lsb's were == 1 (Only valid for active group)
2091 psrlq mm2, 1 // divide raw bytes by 2
2092 pand mm2, mm4 // clear invalid bit 7 of each byte
2093 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2094 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2095 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2096 // byte
2097 cmp ebx, MMXLength
2098 // Now ready to write back to memory
2099 movq [edi + ebx - 8], mm0
2100 // Prep Raw(x-bpp) for next loop
2101 movq mm2, mm0 // mov updated Raws to mm2
2102 jb davg4lp
2103 } // end _asm block
2104 }
2105 break;
2106 case 2:
2107 {
2108 ActiveMask.use = 0x000000000000ffff;
2109 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2110 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2111 _asm {
2112 // Load ActiveMask
2113 movq mm7, ActiveMask
2114 // Re-init address pointers and offset
2115 mov ebx, diff // ebx ==> x = offset to alignment boundary
2116 movq mm5, LBCarryMask
2117 mov edi, row // edi ==> Avg(x)
2118 movq mm4, HBClearMask
2119 mov esi, prev_row // esi ==> Prior(x)
2120 // PRIME the pump (load the first Raw(x-bpp) data set
2121 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2122 // (we correct position in loop below)
2123 davg2lp:
2124 movq mm0, [edi + ebx]
2125 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2126 movq mm1, [esi + ebx]
2127 // Add (Prev_row/2) to Average
2128 movq mm3, mm5
2129 pand mm3, mm1 // get lsb for each prev_row byte
2130 psrlq mm1, 1 // divide prev_row bytes by 2
2131 pand mm1, mm4 // clear invalid bit 7 of each byte
2132 movq mm6, mm7
2133 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2134 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2135 movq mm1, mm3 // now use mm1 for getting LBCarrys
2136 pand mm1, mm2 // get LBCarrys for each byte where both
2137 // lsb's were == 1 (Only valid for active group)
2138 psrlq mm2, 1 // divide raw bytes by 2
2139 pand mm2, mm4 // clear invalid bit 7 of each byte
2140 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2141 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2142 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2143 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2144 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2145 movq mm2, mm0 // mov updated Raws to mm2
2146 psllq mm2, ShiftBpp // shift data to position correctly
2147 movq mm1, mm3 // now use mm1 for getting LBCarrys
2148 pand mm1, mm2 // get LBCarrys for each byte where both
2149 // lsb's were == 1 (Only valid for active group)
2150 psrlq mm2, 1 // divide raw bytes by 2
2151 pand mm2, mm4 // clear invalid bit 7 of each byte
2152 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2153 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2154 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2156 // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
2157 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2158 movq mm2, mm0 // mov updated Raws to mm2
2159 psllq mm2, ShiftBpp // shift data to position correctly
2160 // Data only needs to be shifted once here to
2161 // get the correct x-bpp offset.
2162 movq mm1, mm3 // now use mm1 for getting LBCarrys
2163 pand mm1, mm2 // get LBCarrys for each byte where both
2164 // lsb's were == 1 (Only valid for active group)
2165 psrlq mm2, 1 // divide raw bytes by 2
2166 pand mm2, mm4 // clear invalid bit 7 of each byte
2167 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2168 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2169 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2171 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2172 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2173 movq mm2, mm0 // mov updated Raws to mm2
2174 psllq mm2, ShiftBpp // shift data to position correctly
2175 // Data only needs to be shifted once here to
2176 // get the correct x-bpp offset.
2177 add ebx, 8
2178 movq mm1, mm3 // now use mm1 for getting LBCarrys
2179 pand mm1, mm2 // get LBCarrys for each byte where both
2180 // lsb's were == 1 (Only valid for active group)
2181 psrlq mm2, 1 // divide raw bytes by 2
2182 pand mm2, mm4 // clear invalid bit 7 of each byte
2183 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2184 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2185 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2187 cmp ebx, MMXLength
2188 // Now ready to write back to memory
2189 movq [edi + ebx - 8], mm0
2190 // Prep Raw(x-bpp) for next loop
2191 movq mm2, mm0 // mov updated Raws to mm2
2192 jb davg2lp
2193 } // end _asm block
2194 }
2195 break;
2197 case 1: // bpp == 1
2198 {
2199 _asm {
2200 // Re-init address pointers and offset
2201 mov ebx, diff // ebx ==> x = offset to alignment boundary
2202 mov edi, row // edi ==> Avg(x)
2203 cmp ebx, FullLength // Test if offset at end of array
2204 jnb davg1end
2205 // Do Paeth decode for remaining bytes
2206 mov esi, prev_row // esi ==> Prior(x)
2207 mov edx, edi
2208 xor ecx, ecx // zero ecx before using cl & cx in loop below
2209 sub edx, bpp // edx ==> Raw(x-bpp)
2210 davg1lp:
2211 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2212 xor eax, eax
2213 mov cl, [esi + ebx] // load cl with Prior(x)
2214 mov al, [edx + ebx] // load al with Raw(x-bpp)
2215 add ax, cx
2216 inc ebx
2217 shr ax, 1 // divide by 2
2218 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2219 cmp ebx, FullLength // Check if at end of array
2220 mov [edi+ebx-1], al // Write back Raw(x);
2221 // mov does not affect flags; -1 to offset inc ebx
2222 jb davg1lp
2223 davg1end:
2224 } // end _asm block
2225 }
2226 return;
2228 case 8: // bpp == 8
2229 {
2230 _asm {
2231 // Re-init address pointers and offset
2232 mov ebx, diff // ebx ==> x = offset to alignment boundary
2233 movq mm5, LBCarryMask
2234 mov edi, row // edi ==> Avg(x)
2235 movq mm4, HBClearMask
2236 mov esi, prev_row // esi ==> Prior(x)
2237 // PRIME the pump (load the first Raw(x-bpp) data set
2238 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2239 // (NO NEED to correct position in loop below)
2240 davg8lp:
2241 movq mm0, [edi + ebx]
2242 movq mm3, mm5
2243 movq mm1, [esi + ebx]
2244 add ebx, 8
2245 pand mm3, mm1 // get lsb for each prev_row byte
2246 psrlq mm1, 1 // divide prev_row bytes by 2
2247 pand mm3, mm2 // get LBCarrys for each byte where both
2248 // lsb's were == 1
2249 psrlq mm2, 1 // divide raw bytes by 2
2250 pand mm1, mm4 // clear invalid bit 7 of each byte
2251 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2252 pand mm2, mm4 // clear invalid bit 7 of each byte
2253 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2254 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2255 cmp ebx, MMXLength
2256 movq [edi + ebx - 8], mm0
2257 movq mm2, mm0 // reuse as Raw(x-bpp)
2258 jb davg8lp
2259 } // end _asm block
2260 }
2261 break;
2262 default: // bpp greater than 8
2263 {
2264 _asm {
2265 movq mm5, LBCarryMask
2266 // Re-init address pointers and offset
2267 mov ebx, diff // ebx ==> x = offset to alignment boundary
2268 mov edi, row // edi ==> Avg(x)
2269 movq mm4, HBClearMask
2270 mov edx, edi
2271 mov esi, prev_row // esi ==> Prior(x)
2272 sub edx, bpp // edx ==> Raw(x-bpp)
2273 davgAlp:
2274 movq mm0, [edi + ebx]
2275 movq mm3, mm5
2276 movq mm1, [esi + ebx]
2277 pand mm3, mm1 // get lsb for each prev_row byte
2278 movq mm2, [edx + ebx]
2279 psrlq mm1, 1 // divide prev_row bytes by 2
2280 pand mm3, mm2 // get LBCarrys for each byte where both
2281 // lsb's were == 1
2282 psrlq mm2, 1 // divide raw bytes by 2
2283 pand mm1, mm4 // clear invalid bit 7 of each byte
2284 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2285 pand mm2, mm4 // clear invalid bit 7 of each byte
2286 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2287 add ebx, 8
2288 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2289 cmp ebx, MMXLength
2290 movq [edi + ebx - 8], mm0
2291 jb davgAlp
2292 } // end _asm block
2293 }
2294 break;
2295 } // end switch ( bpp )
2297 _asm {
2298 // MMX acceleration complete now do clean-up
2299 // Check if any remaining bytes left to decode
2300 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2301 mov edi, row // edi ==> Avg(x)
2302 cmp ebx, FullLength // Test if offset at end of array
2303 jnb davgend
2304 // Do Paeth decode for remaining bytes
2305 mov esi, prev_row // esi ==> Prior(x)
2306 mov edx, edi
2307 xor ecx, ecx // zero ecx before using cl & cx in loop below
2308 sub edx, bpp // edx ==> Raw(x-bpp)
2309 davglp2:
2310 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2311 xor eax, eax
2312 mov cl, [esi + ebx] // load cl with Prior(x)
2313 mov al, [edx + ebx] // load al with Raw(x-bpp)
2314 add ax, cx
2315 inc ebx
2316 shr ax, 1 // divide by 2
2317 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2318 cmp ebx, FullLength // Check if at end of array
2319 mov [edi+ebx-1], al // Write back Raw(x);
2320 // mov does not affect flags; -1 to offset inc ebx
2321 jb davglp2
2322 davgend:
2323 emms // End MMX instructions; prep for possible FP instrs.
2324 } // end _asm block
2325 }
2327 // Optimized code for PNG Paeth filter decoder
2328 void /* PRIVATE */
2329 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2330 png_bytep prev_row)
2331 {
2332 png_uint_32 FullLength;
2333 png_uint_32 MMXLength;
2334 //png_uint_32 len;
2335 int bpp;
2336 int diff;
2337 //int ptemp;
2338 int patemp, pbtemp, pctemp;
2340 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2341 FullLength = row_info->rowbytes; // # of bytes to filter
2342 _asm
2343 {
2344 xor ebx, ebx // ebx ==> x offset
2345 mov edi, row
2346 xor edx, edx // edx ==> x-bpp offset
2347 mov esi, prev_row
2348 xor eax, eax
2350 // Compute the Raw value for the first bpp bytes
2351 // Note: the formula works out to be always
2352 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2353 dpthrlp:
2354 mov al, [edi + ebx]
2355 add al, [esi + ebx]
2356 inc ebx
2357 cmp ebx, bpp
2358 mov [edi + ebx - 1], al
2359 jb dpthrlp
2360 // get # of bytes to alignment
2361 mov diff, edi // take start of row
2362 add diff, ebx // add bpp
2363 xor ecx, ecx
2364 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2365 and diff, 0xfffffff8 // mask to alignment boundary
2366 sub diff, edi // subtract from start ==> value ebx at alignment
2367 jz dpthgo
2368 // fix alignment
2369 dpthlp1:
2370 xor eax, eax
2371 // pav = p - a = (a + b - c) - a = b - c
2372 mov al, [esi + ebx] // load Prior(x) into al
2373 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2374 sub eax, ecx // subtract Prior(x-bpp)
2375 mov patemp, eax // Save pav for later use
2376 xor eax, eax
2377 // pbv = p - b = (a + b - c) - b = a - c
2378 mov al, [edi + edx] // load Raw(x-bpp) into al
2379 sub eax, ecx // subtract Prior(x-bpp)
2380 mov ecx, eax
2381 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2382 add eax, patemp // pcv = pav + pbv
2383 // pc = abs(pcv)
2384 test eax, 0x80000000
2385 jz dpthpca
2386 neg eax // reverse sign of neg values
2387 dpthpca:
2388 mov pctemp, eax // save pc for later use
2389 // pb = abs(pbv)
2390 test ecx, 0x80000000
2391 jz dpthpba
2392 neg ecx // reverse sign of neg values
2393 dpthpba:
2394 mov pbtemp, ecx // save pb for later use
2395 // pa = abs(pav)
2396 mov eax, patemp
2397 test eax, 0x80000000
2398 jz dpthpaa
2399 neg eax // reverse sign of neg values
2400 dpthpaa:
2401 mov patemp, eax // save pa for later use
2402 // test if pa <= pb
2403 cmp eax, ecx
2404 jna dpthabb
2405 // pa > pb; now test if pb <= pc
2406 cmp ecx, pctemp
2407 jna dpthbbc
2408 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2409 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2410 jmp dpthpaeth
2411 dpthbbc:
2412 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2413 mov cl, [esi + ebx] // load Prior(x) into cl
2414 jmp dpthpaeth
2415 dpthabb:
2416 // pa <= pb; now test if pa <= pc
2417 cmp eax, pctemp
2418 jna dpthabc
2419 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2420 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2421 jmp dpthpaeth
2422 dpthabc:
2423 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2424 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2425 dpthpaeth:
2426 inc ebx
2427 inc edx
2428 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2429 add [edi + ebx - 1], cl
2430 cmp ebx, diff
2431 jb dpthlp1
2432 dpthgo:
2433 mov ecx, FullLength
2434 mov eax, ecx
2435 sub eax, ebx // subtract alignment fix
2436 and eax, 0x00000007 // calc bytes over mult of 8
2437 sub ecx, eax // drop over bytes from original length
2438 mov MMXLength, ecx
2439 } // end _asm block
2440 // Now do the math for the rest of the row
2441 switch ( bpp )
2442 {
2443 case 3:
2444 {
2445 ActiveMask.use = 0x0000000000ffffff;
2446 ActiveMaskEnd.use = 0xffff000000000000;
2447 ShiftBpp.use = 24; // == bpp(3) * 8
2448 ShiftRem.use = 40; // == 64 - 24
2449 _asm
2450 {
2451 mov ebx, diff
2452 mov edi, row
2453 mov esi, prev_row
2454 pxor mm0, mm0
2455 // PRIME the pump (load the first Raw(x-bpp) data set
2456 movq mm1, [edi+ebx-8]
2457 dpth3lp:
2458 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2459 movq mm2, [esi + ebx] // load b=Prior(x)
2460 punpcklbw mm1, mm0 // Unpack High bytes of a
2461 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2462 punpcklbw mm2, mm0 // Unpack High bytes of b
2463 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2464 // pav = p - a = (a + b - c) - a = b - c
2465 movq mm4, mm2
2466 punpcklbw mm3, mm0 // Unpack High bytes of c
2467 // pbv = p - b = (a + b - c) - b = a - c
2468 movq mm5, mm1
2469 psubw mm4, mm3
2470 pxor mm7, mm7
2471 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2472 movq mm6, mm4
2473 psubw mm5, mm3
2475 // pa = abs(p-a) = abs(pav)
2476 // pb = abs(p-b) = abs(pbv)
2477 // pc = abs(p-c) = abs(pcv)
2478 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2479 paddw mm6, mm5
2480 pand mm0, mm4 // Only pav bytes < 0 in mm7
2481 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2482 psubw mm4, mm0
2483 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2484 psubw mm4, mm0
2485 psubw mm5, mm7
2486 pxor mm0, mm0
2487 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2488 pand mm0, mm6 // Only pav bytes < 0 in mm7
2489 psubw mm5, mm7
2490 psubw mm6, mm0
2491 // test pa <= pb
2492 movq mm7, mm4
2493 psubw mm6, mm0
2494 pcmpgtw mm7, mm5 // pa > pb?
2495 movq mm0, mm7
2496 // use mm7 mask to merge pa & pb
2497 pand mm5, mm7
2498 // use mm0 mask copy to merge a & b
2499 pand mm2, mm0
2500 pandn mm7, mm4
2501 pandn mm0, mm1
2502 paddw mm7, mm5
2503 paddw mm0, mm2
2504 // test ((pa <= pb)? pa:pb) <= pc
2505 pcmpgtw mm7, mm6 // pab > pc?
2506 pxor mm1, mm1
2507 pand mm3, mm7
2508 pandn mm7, mm0
2509 paddw mm7, mm3
2510 pxor mm0, mm0
2511 packuswb mm7, mm1
2512 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2513 pand mm7, ActiveMask
2514 movq mm2, mm3 // load b=Prior(x) step 1
2515 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2516 punpcklbw mm3, mm0 // Unpack High bytes of c
2517 movq [edi + ebx], mm7 // write back updated value
2518 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2519 // Now do Paeth for 2nd set of bytes (3-5)
2520 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2521 punpcklbw mm1, mm0 // Unpack High bytes of a
2522 pxor mm7, mm7
2523 punpcklbw mm2, mm0 // Unpack High bytes of b
2524 // pbv = p - b = (a + b - c) - b = a - c
2525 movq mm5, mm1
2526 // pav = p - a = (a + b - c) - a = b - c
2527 movq mm4, mm2
2528 psubw mm5, mm3
2529 psubw mm4, mm3
2530 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2531 // pav + pbv = pbv + pav
2532 movq mm6, mm5
2533 paddw mm6, mm4
2535 // pa = abs(p-a) = abs(pav)
2536 // pb = abs(p-b) = abs(pbv)
2537 // pc = abs(p-c) = abs(pcv)
2538 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2539 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2540 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2541 pand mm7, mm4 // Only pav bytes < 0 in mm7
2542 psubw mm5, mm0
2543 psubw mm4, mm7
2544 psubw mm5, mm0
2545 psubw mm4, mm7
2546 pxor mm0, mm0
2547 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2548 pand mm0, mm6 // Only pav bytes < 0 in mm7
2549 psubw mm6, mm0
2550 // test pa <= pb
2551 movq mm7, mm4
2552 psubw mm6, mm0
2553 pcmpgtw mm7, mm5 // pa > pb?
2554 movq mm0, mm7
2555 // use mm7 mask to merge pa & pb
2556 pand mm5, mm7
2557 // use mm0 mask copy to merge a & b
2558 pand mm2, mm0
2559 pandn mm7, mm4
2560 pandn mm0, mm1
2561 paddw mm7, mm5
2562 paddw mm0, mm2
2563 // test ((pa <= pb)? pa:pb) <= pc
2564 pcmpgtw mm7, mm6 // pab > pc?
2565 movq mm2, [esi + ebx] // load b=Prior(x)
2566 pand mm3, mm7
2567 pandn mm7, mm0
2568 pxor mm1, mm1
2569 paddw mm7, mm3
2570 pxor mm0, mm0
2571 packuswb mm7, mm1
2572 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2573 pand mm7, ActiveMask
2574 punpckhbw mm2, mm0 // Unpack High bytes of b
2575 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2576 // pav = p - a = (a + b - c) - a = b - c
2577 movq mm4, mm2
2578 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2579 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2580 movq [edi + ebx], mm7 // write back updated value
2581 movq mm1, mm7
2582 punpckhbw mm3, mm0 // Unpack High bytes of c
2583 psllq mm1, ShiftBpp // Shift bytes
2584 // Now mm1 will be used as Raw(x-bpp)
2585 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2586 pxor mm7, mm7
2587 punpckhbw mm1, mm0 // Unpack High bytes of a
2588 psubw mm4, mm3
2589 // pbv = p - b = (a + b - c) - b = a - c
2590 movq mm5, mm1
2591 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2592 movq mm6, mm4
2593 psubw mm5, mm3
2594 pxor mm0, mm0
2595 paddw mm6, mm5
2597 // pa = abs(p-a) = abs(pav)
2598 // pb = abs(p-b) = abs(pbv)
2599 // pc = abs(p-c) = abs(pcv)
2600 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2601 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2602 pand mm0, mm4 // Only pav bytes < 0 in mm7
2603 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2604 psubw mm4, mm0
2605 psubw mm5, mm7
2606 psubw mm4, mm0
2607 psubw mm5, mm7
2608 pxor mm0, mm0
2609 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2610 pand mm0, mm6 // Only pav bytes < 0 in mm7
2611 psubw mm6, mm0
2612 // test pa <= pb
2613 movq mm7, mm4
2614 psubw mm6, mm0
2615 pcmpgtw mm7, mm5 // pa > pb?
2616 movq mm0, mm7
2617 // use mm0 mask copy to merge a & b
2618 pand mm2, mm0
2619 // use mm7 mask to merge pa & pb
2620 pand mm5, mm7
2621 pandn mm0, mm1
2622 pandn mm7, mm4
2623 paddw mm0, mm2
2624 paddw mm7, mm5
2625 // test ((pa <= pb)? pa:pb) <= pc
2626 pcmpgtw mm7, mm6 // pab > pc?
2627 pand mm3, mm7
2628 pandn mm7, mm0
2629 paddw mm7, mm3
2630 pxor mm1, mm1
2631 packuswb mm1, mm7
2632 // Step ebx to next set of 8 bytes and repeat loop til done
2633 add ebx, 8
2634 pand mm1, ActiveMaskEnd
2635 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2637 cmp ebx, MMXLength
2638 pxor mm0, mm0 // pxor does not affect flags
2639 movq [edi + ebx - 8], mm1 // write back updated value
2640 // mm1 will be used as Raw(x-bpp) next loop
2641 // mm3 ready to be used as Prior(x-bpp) next loop
2642 jb dpth3lp
2643 } // end _asm block
2644 }
2645 break;
2647 case 6:
2648 case 7:
2649 case 5:
2650 {
2651 ActiveMask.use = 0x00000000ffffffff;
2652 ActiveMask2.use = 0xffffffff00000000;
2653 ShiftBpp.use = bpp << 3; // == bpp * 8
2654 ShiftRem.use = 64 - ShiftBpp.use;
2655 _asm
2656 {
2657 mov ebx, diff
2658 mov edi, row
2659 mov esi, prev_row
2660 // PRIME the pump (load the first Raw(x-bpp) data set
2661 movq mm1, [edi+ebx-8]
2662 pxor mm0, mm0
2663 dpth6lp:
2664 // Must shift to position Raw(x-bpp) data
2665 psrlq mm1, ShiftRem
2666 // Do first set of 4 bytes
2667 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2668 punpcklbw mm1, mm0 // Unpack Low bytes of a
2669 movq mm2, [esi + ebx] // load b=Prior(x)
2670 punpcklbw mm2, mm0 // Unpack Low bytes of b
2671 // Must shift to position Prior(x-bpp) data
2672 psrlq mm3, ShiftRem
2673 // pav = p - a = (a + b - c) - a = b - c
2674 movq mm4, mm2
2675 punpcklbw mm3, mm0 // Unpack Low bytes of c
2676 // pbv = p - b = (a + b - c) - b = a - c
2677 movq mm5, mm1
2678 psubw mm4, mm3
2679 pxor mm7, mm7
2680 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2681 movq mm6, mm4
2682 psubw mm5, mm3
2683 // pa = abs(p-a) = abs(pav)
2684 // pb = abs(p-b) = abs(pbv)
2685 // pc = abs(p-c) = abs(pcv)
2686 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2687 paddw mm6, mm5
2688 pand mm0, mm4 // Only pav bytes < 0 in mm7
2689 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2690 psubw mm4, mm0
2691 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2692 psubw mm4, mm0
2693 psubw mm5, mm7
2694 pxor mm0, mm0
2695 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2696 pand mm0, mm6 // Only pav bytes < 0 in mm7
2697 psubw mm5, mm7
2698 psubw mm6, mm0
2699 // test pa <= pb
2700 movq mm7, mm4
2701 psubw mm6, mm0
2702 pcmpgtw mm7, mm5 // pa > pb?
2703 movq mm0, mm7
2704 // use mm7 mask to merge pa & pb
2705 pand mm5, mm7
2706 // use mm0 mask copy to merge a & b
2707 pand mm2, mm0
2708 pandn mm7, mm4
2709 pandn mm0, mm1
2710 paddw mm7, mm5
2711 paddw mm0, mm2
2712 // test ((pa <= pb)? pa:pb) <= pc
2713 pcmpgtw mm7, mm6 // pab > pc?
2714 pxor mm1, mm1
2715 pand mm3, mm7
2716 pandn mm7, mm0
2717 paddw mm7, mm3
2718 pxor mm0, mm0
2719 packuswb mm7, mm1
2720 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2721 pand mm7, ActiveMask
2722 psrlq mm3, ShiftRem
2723 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2724 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2725 movq mm6, mm2
2726 movq [edi + ebx], mm7 // write back updated value
2727 movq mm1, [edi+ebx-8]
2728 psllq mm6, ShiftBpp
2729 movq mm5, mm7
2730 psrlq mm1, ShiftRem
2731 por mm3, mm6
2732 psllq mm5, ShiftBpp
2733 punpckhbw mm3, mm0 // Unpack High bytes of c
2734 por mm1, mm5
2735 // Do second set of 4 bytes
2736 punpckhbw mm2, mm0 // Unpack High bytes of b
2737 punpckhbw mm1, mm0 // Unpack High bytes of a
2738 // pav = p - a = (a + b - c) - a = b - c
2739 movq mm4, mm2
2740 // pbv = p - b = (a + b - c) - b = a - c
2741 movq mm5, mm1
2742 psubw mm4, mm3
2743 pxor mm7, mm7
2744 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2745 movq mm6, mm4
2746 psubw mm5, mm3
2747 // pa = abs(p-a) = abs(pav)
2748 // pb = abs(p-b) = abs(pbv)
2749 // pc = abs(p-c) = abs(pcv)
2750 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2751 paddw mm6, mm5
2752 pand mm0, mm4 // Only pav bytes < 0 in mm7
2753 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2754 psubw mm4, mm0
2755 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2756 psubw mm4, mm0
2757 psubw mm5, mm7
2758 pxor mm0, mm0
2759 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2760 pand mm0, mm6 // Only pav bytes < 0 in mm7
2761 psubw mm5, mm7
2762 psubw mm6, mm0
2763 // test pa <= pb
2764 movq mm7, mm4
2765 psubw mm6, mm0
2766 pcmpgtw mm7, mm5 // pa > pb?
2767 movq mm0, mm7
2768 // use mm7 mask to merge pa & pb
2769 pand mm5, mm7
2770 // use mm0 mask copy to merge a & b
2771 pand mm2, mm0
2772 pandn mm7, mm4
2773 pandn mm0, mm1
2774 paddw mm7, mm5
2775 paddw mm0, mm2
2776 // test ((pa <= pb)? pa:pb) <= pc
2777 pcmpgtw mm7, mm6 // pab > pc?
2778 pxor mm1, mm1
2779 pand mm3, mm7
2780 pandn mm7, mm0
2781 pxor mm1, mm1
2782 paddw mm7, mm3
2783 pxor mm0, mm0
2784 // Step ex to next set of 8 bytes and repeat loop til done
2785 add ebx, 8
2786 packuswb mm1, mm7
2787 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2788 cmp ebx, MMXLength
2789 movq [edi + ebx - 8], mm1 // write back updated value
2790 // mm1 will be used as Raw(x-bpp) next loop
2791 jb dpth6lp
2792 } // end _asm block
2793 }
2794 break;
2796 case 4:
2797 {
2798 ActiveMask.use = 0x00000000ffffffff;
2799 _asm {
2800 mov ebx, diff
2801 mov edi, row
2802 mov esi, prev_row
2803 pxor mm0, mm0
2804 // PRIME the pump (load the first Raw(x-bpp) data set
2805 movq mm1, [edi+ebx-8] // Only time should need to read
2806 // a=Raw(x-bpp) bytes
2807 dpth4lp:
2808 // Do first set of 4 bytes
2809 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2810 punpckhbw mm1, mm0 // Unpack Low bytes of a
2811 movq mm2, [esi + ebx] // load b=Prior(x)
2812 punpcklbw mm2, mm0 // Unpack High bytes of b
2813 // pav = p - a = (a + b - c) - a = b - c
2814 movq mm4, mm2
2815 punpckhbw mm3, mm0 // Unpack High bytes of c
2816 // pbv = p - b = (a + b - c) - b = a - c
2817 movq mm5, mm1
2818 psubw mm4, mm3
2819 pxor mm7, mm7
2820 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2821 movq mm6, mm4
2822 psubw mm5, mm3
2823 // pa = abs(p-a) = abs(pav)
2824 // pb = abs(p-b) = abs(pbv)
2825 // pc = abs(p-c) = abs(pcv)
2826 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2827 paddw mm6, mm5
2828 pand mm0, mm4 // Only pav bytes < 0 in mm7
2829 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2830 psubw mm4, mm0
2831 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2832 psubw mm4, mm0
2833 psubw mm5, mm7
2834 pxor mm0, mm0
2835 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2836 pand mm0, mm6 // Only pav bytes < 0 in mm7
2837 psubw mm5, mm7
2838 psubw mm6, mm0
2839 // test pa <= pb
2840 movq mm7, mm4
2841 psubw mm6, mm0
2842 pcmpgtw mm7, mm5 // pa > pb?
2843 movq mm0, mm7
2844 // use mm7 mask to merge pa & pb
2845 pand mm5, mm7
2846 // use mm0 mask copy to merge a & b
2847 pand mm2, mm0
2848 pandn mm7, mm4
2849 pandn mm0, mm1
2850 paddw mm7, mm5
2851 paddw mm0, mm2
2852 // test ((pa <= pb)? pa:pb) <= pc
2853 pcmpgtw mm7, mm6 // pab > pc?
2854 pxor mm1, mm1
2855 pand mm3, mm7
2856 pandn mm7, mm0
2857 paddw mm7, mm3
2858 pxor mm0, mm0
2859 packuswb mm7, mm1
2860 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2861 pand mm7, ActiveMask
2862 movq mm2, mm3 // load b=Prior(x) step 1
2863 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2864 punpcklbw mm3, mm0 // Unpack High bytes of c
2865 movq [edi + ebx], mm7 // write back updated value
2866 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2867 // Do second set of 4 bytes
2868 punpckhbw mm2, mm0 // Unpack Low bytes of b
2869 punpcklbw mm1, mm0 // Unpack Low bytes of a
2870 // pav = p - a = (a + b - c) - a = b - c
2871 movq mm4, mm2
2872 // pbv = p - b = (a + b - c) - b = a - c
2873 movq mm5, mm1
2874 psubw mm4, mm3
2875 pxor mm7, mm7
2876 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2877 movq mm6, mm4
2878 psubw mm5, mm3
2879 // pa = abs(p-a) = abs(pav)
2880 // pb = abs(p-b) = abs(pbv)
2881 // pc = abs(p-c) = abs(pcv)
2882 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2883 paddw mm6, mm5
2884 pand mm0, mm4 // Only pav bytes < 0 in mm7
2885 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2886 psubw mm4, mm0
2887 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2888 psubw mm4, mm0
2889 psubw mm5, mm7
2890 pxor mm0, mm0
2891 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2892 pand mm0, mm6 // Only pav bytes < 0 in mm7
2893 psubw mm5, mm7
2894 psubw mm6, mm0
2895 // test pa <= pb
2896 movq mm7, mm4
2897 psubw mm6, mm0
2898 pcmpgtw mm7, mm5 // pa > pb?
2899 movq mm0, mm7
2900 // use mm7 mask to merge pa & pb
2901 pand mm5, mm7
2902 // use mm0 mask copy to merge a & b
2903 pand mm2, mm0
2904 pandn mm7, mm4
2905 pandn mm0, mm1
2906 paddw mm7, mm5
2907 paddw mm0, mm2
2908 // test ((pa <= pb)? pa:pb) <= pc
2909 pcmpgtw mm7, mm6 // pab > pc?
2910 pxor mm1, mm1
2911 pand mm3, mm7
2912 pandn mm7, mm0
2913 pxor mm1, mm1
2914 paddw mm7, mm3
2915 pxor mm0, mm0
2916 // Step ex to next set of 8 bytes and repeat loop til done
2917 add ebx, 8
2918 packuswb mm1, mm7
2919 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2920 cmp ebx, MMXLength
2921 movq [edi + ebx - 8], mm1 // write back updated value
2922 // mm1 will be used as Raw(x-bpp) next loop
2923 jb dpth4lp
2924 } // end _asm block
2925 }
2926 break;
2927 case 8: // bpp == 8
2928 {
2929 ActiveMask.use = 0x00000000ffffffff;
2930 _asm {
2931 mov ebx, diff
2932 mov edi, row
2933 mov esi, prev_row
2934 pxor mm0, mm0
2935 // PRIME the pump (load the first Raw(x-bpp) data set
2936 movq mm1, [edi+ebx-8] // Only time should need to read
2937 // a=Raw(x-bpp) bytes
2938 dpth8lp:
2939 // Do first set of 4 bytes
2940 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2941 punpcklbw mm1, mm0 // Unpack Low bytes of a
2942 movq mm2, [esi + ebx] // load b=Prior(x)
2943 punpcklbw mm2, mm0 // Unpack Low bytes of b
2944 // pav = p - a = (a + b - c) - a = b - c
2945 movq mm4, mm2
2946 punpcklbw mm3, mm0 // Unpack Low bytes of c
2947 // pbv = p - b = (a + b - c) - b = a - c
2948 movq mm5, mm1
2949 psubw mm4, mm3
2950 pxor mm7, mm7
2951 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2952 movq mm6, mm4
2953 psubw mm5, mm3
2954 // pa = abs(p-a) = abs(pav)
2955 // pb = abs(p-b) = abs(pbv)
2956 // pc = abs(p-c) = abs(pcv)
2957 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2958 paddw mm6, mm5
2959 pand mm0, mm4 // Only pav bytes < 0 in mm7
2960 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2961 psubw mm4, mm0
2962 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2963 psubw mm4, mm0
2964 psubw mm5, mm7
2965 pxor mm0, mm0
2966 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2967 pand mm0, mm6 // Only pav bytes < 0 in mm7
2968 psubw mm5, mm7
2969 psubw mm6, mm0
2970 // test pa <= pb
2971 movq mm7, mm4
2972 psubw mm6, mm0
2973 pcmpgtw mm7, mm5 // pa > pb?
2974 movq mm0, mm7
2975 // use mm7 mask to merge pa & pb
2976 pand mm5, mm7
2977 // use mm0 mask copy to merge a & b
2978 pand mm2, mm0
2979 pandn mm7, mm4
2980 pandn mm0, mm1
2981 paddw mm7, mm5
2982 paddw mm0, mm2
2983 // test ((pa <= pb)? pa:pb) <= pc
2984 pcmpgtw mm7, mm6 // pab > pc?
2985 pxor mm1, mm1
2986 pand mm3, mm7
2987 pandn mm7, mm0
2988 paddw mm7, mm3
2989 pxor mm0, mm0
2990 packuswb mm7, mm1
2991 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2992 pand mm7, ActiveMask
2993 movq mm2, [esi + ebx] // load b=Prior(x)
2994 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2995 punpckhbw mm3, mm0 // Unpack High bytes of c
2996 movq [edi + ebx], mm7 // write back updated value
2997 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
2999 // Do second set of 4 bytes
3000 punpckhbw mm2, mm0 // Unpack High bytes of b
3001 punpckhbw mm1, mm0 // Unpack High bytes of a
3002 // pav = p - a = (a + b - c) - a = b - c
3003 movq mm4, mm2
3004 // pbv = p - b = (a + b - c) - b = a - c
3005 movq mm5, mm1
3006 psubw mm4, mm3
3007 pxor mm7, mm7
3008 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3009 movq mm6, mm4
3010 psubw mm5, mm3
3011 // pa = abs(p-a) = abs(pav)
3012 // pb = abs(p-b) = abs(pbv)
3013 // pc = abs(p-c) = abs(pcv)
3014 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3015 paddw mm6, mm5
3016 pand mm0, mm4 // Only pav bytes < 0 in mm7
3017 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3018 psubw mm4, mm0
3019 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3020 psubw mm4, mm0
3021 psubw mm5, mm7
3022 pxor mm0, mm0
3023 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3024 pand mm0, mm6 // Only pav bytes < 0 in mm7
3025 psubw mm5, mm7
3026 psubw mm6, mm0
3027 // test pa <= pb
3028 movq mm7, mm4
3029 psubw mm6, mm0
3030 pcmpgtw mm7, mm5 // pa > pb?
3031 movq mm0, mm7
3032 // use mm7 mask to merge pa & pb
3033 pand mm5, mm7
3034 // use mm0 mask copy to merge a & b
3035 pand mm2, mm0
3036 pandn mm7, mm4
3037 pandn mm0, mm1
3038 paddw mm7, mm5
3039 paddw mm0, mm2
3040 // test ((pa <= pb)? pa:pb) <= pc
3041 pcmpgtw mm7, mm6 // pab > pc?
3042 pxor mm1, mm1
3043 pand mm3, mm7
3044 pandn mm7, mm0
3045 pxor mm1, mm1
3046 paddw mm7, mm3
3047 pxor mm0, mm0
3048 // Step ex to next set of 8 bytes and repeat loop til done
3049 add ebx, 8
3050 packuswb mm1, mm7
3051 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3052 cmp ebx, MMXLength
3053 movq [edi + ebx - 8], mm1 // write back updated value
3054 // mm1 will be used as Raw(x-bpp) next loop
3055 jb dpth8lp
3056 } // end _asm block
3057 }
3058 break;
3060 case 1: // bpp = 1
3061 case 2: // bpp = 2
3062 default: // bpp > 8
3063 {
3064 _asm {
3065 mov ebx, diff
3066 cmp ebx, FullLength
3067 jnb dpthdend
3068 mov edi, row
3069 mov esi, prev_row
3070 // Do Paeth decode for remaining bytes
3071 mov edx, ebx
3072 xor ecx, ecx // zero ecx before using cl & cx in loop below
3073 sub edx, bpp // Set edx = ebx - bpp
3074 dpthdlp:
3075 xor eax, eax
3076 // pav = p - a = (a + b - c) - a = b - c
3077 mov al, [esi + ebx] // load Prior(x) into al
3078 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3079 sub eax, ecx // subtract Prior(x-bpp)
3080 mov patemp, eax // Save pav for later use
3081 xor eax, eax
3082 // pbv = p - b = (a + b - c) - b = a - c
3083 mov al, [edi + edx] // load Raw(x-bpp) into al
3084 sub eax, ecx // subtract Prior(x-bpp)
3085 mov ecx, eax
3086 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3087 add eax, patemp // pcv = pav + pbv
3088 // pc = abs(pcv)
3089 test eax, 0x80000000
3090 jz dpthdpca
3091 neg eax // reverse sign of neg values
3092 dpthdpca:
3093 mov pctemp, eax // save pc for later use
3094 // pb = abs(pbv)
3095 test ecx, 0x80000000
3096 jz dpthdpba
3097 neg ecx // reverse sign of neg values
3098 dpthdpba:
3099 mov pbtemp, ecx // save pb for later use
3100 // pa = abs(pav)
3101 mov eax, patemp
3102 test eax, 0x80000000
3103 jz dpthdpaa
3104 neg eax // reverse sign of neg values
3105 dpthdpaa:
3106 mov patemp, eax // save pa for later use
3107 // test if pa <= pb
3108 cmp eax, ecx
3109 jna dpthdabb
3110 // pa > pb; now test if pb <= pc
3111 cmp ecx, pctemp
3112 jna dpthdbbc
3113 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3114 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3115 jmp dpthdpaeth
3116 dpthdbbc:
3117 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3118 mov cl, [esi + ebx] // load Prior(x) into cl
3119 jmp dpthdpaeth
3120 dpthdabb:
3121 // pa <= pb; now test if pa <= pc
3122 cmp eax, pctemp
3123 jna dpthdabc
3124 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3125 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3126 jmp dpthdpaeth
3127 dpthdabc:
3128 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3129 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3130 dpthdpaeth:
3131 inc ebx
3132 inc edx
3133 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3134 add [edi + ebx - 1], cl
3135 cmp ebx, FullLength
3136 jb dpthdlp
3137 dpthdend:
3138 } // end _asm block
3139 }
3140 return; // No need to go further with this one
3141 } // end switch ( bpp )
3142 _asm
3143 {
3144 // MMX acceleration complete now do clean-up
3145 // Check if any remaining bytes left to decode
3146 mov ebx, MMXLength
3147 cmp ebx, FullLength
3148 jnb dpthend
3149 mov edi, row
3150 mov esi, prev_row
3151 // Do Paeth decode for remaining bytes
3152 mov edx, ebx
3153 xor ecx, ecx // zero ecx before using cl & cx in loop below
3154 sub edx, bpp // Set edx = ebx - bpp
3155 dpthlp2:
3156 xor eax, eax
3157 // pav = p - a = (a + b - c) - a = b - c
3158 mov al, [esi + ebx] // load Prior(x) into al
3159 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3160 sub eax, ecx // subtract Prior(x-bpp)
3161 mov patemp, eax // Save pav for later use
3162 xor eax, eax
3163 // pbv = p - b = (a + b - c) - b = a - c
3164 mov al, [edi + edx] // load Raw(x-bpp) into al
3165 sub eax, ecx // subtract Prior(x-bpp)
3166 mov ecx, eax
3167 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3168 add eax, patemp // pcv = pav + pbv
3169 // pc = abs(pcv)
3170 test eax, 0x80000000
3171 jz dpthpca2
3172 neg eax // reverse sign of neg values
3173 dpthpca2:
3174 mov pctemp, eax // save pc for later use
3175 // pb = abs(pbv)
3176 test ecx, 0x80000000
3177 jz dpthpba2
3178 neg ecx // reverse sign of neg values
3179 dpthpba2:
3180 mov pbtemp, ecx // save pb for later use
3181 // pa = abs(pav)
3182 mov eax, patemp
3183 test eax, 0x80000000
3184 jz dpthpaa2
3185 neg eax // reverse sign of neg values
3186 dpthpaa2:
3187 mov patemp, eax // save pa for later use
3188 // test if pa <= pb
3189 cmp eax, ecx
3190 jna dpthabb2
3191 // pa > pb; now test if pb <= pc
3192 cmp ecx, pctemp
3193 jna dpthbbc2
3194 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3195 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3196 jmp dpthpaeth2
3197 dpthbbc2:
3198 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3199 mov cl, [esi + ebx] // load Prior(x) into cl
3200 jmp dpthpaeth2
3201 dpthabb2:
3202 // pa <= pb; now test if pa <= pc
3203 cmp eax, pctemp
3204 jna dpthabc2
3205 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3206 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3207 jmp dpthpaeth2
3208 dpthabc2:
3209 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3210 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3211 dpthpaeth2:
3212 inc ebx
3213 inc edx
3214 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3215 add [edi + ebx - 1], cl
3216 cmp ebx, FullLength
3217 jb dpthlp2
3218 dpthend:
3219 emms // End MMX instructions; prep for possible FP instrs.
3220 } // end _asm block
3221 }
// Optimized code for PNG Sub filter decoder.
//
// Undoes the PNG "Sub" filter in place:
//     Raw(x) = Sub(x) + Raw(x-bpp)   (mod 256)
// The first bpp bytes of the row have no Raw(x-bpp) predecessor and are
// left untouched; filtering covers the remaining rowbytes - bpp bytes.
// A short byte loop first brings the write pointer up to an 8-byte
// boundary, an MMX loop (selected per bpp) then processes 8 bytes at a
// time, and a final byte loop handles the sub-8-byte tail.
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   int bpp;                 // bytes per complete pixel
   png_uint_32 FullLength;  // total # of bytes to filter (rowbytes - bpp)
   png_uint_32 MMXLength;   // # of bytes handled by the MMX loop (multiple
                            // of 8 past the alignment fix-up)
   int diff;                // byte offset at which the MMX loop starts

   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   FullLength = row_info->rowbytes - bpp;  // # of bytes to filter
   _asm {
      mov edi, row
      mov esi, edi               // lp = row (source: Raw(x-bpp))
      add edi, bpp               // rp = row + bpp (dest: Sub(x) -> Raw(x))
      xor eax, eax
      // get # of bytes to alignment
      mov diff, edi              // take start of row
      add diff, 0xf              // add 7 + 8 to incr past
                                 // alignment boundary
                                 // (the extra 8 guarantees diff >= 8, so the
                                 // MMX "prime" read [edi+ebx-8] below stays
                                 // inside the row)
      xor ebx, ebx
      and diff, 0xfffffff8       // mask to alignment boundary
      sub diff, edi              // subtract from start ==> value
                                 // ebx at alignment
      jz dsubgo
      // fix alignment: plain byte-at-a-time Sub decode up to the boundary
dsublp1:
      mov al, [esi+ebx]          // al = Raw(x-bpp)
      add [edi+ebx], al          // Raw(x) = Sub(x) + Raw(x-bpp)
      inc ebx
      cmp ebx, diff
      jb dsublp1
dsubgo:
      mov ecx, FullLength
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      mov MMXLength, ecx
   } // end _asm block

   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         // Three pixels (9 bytes) overlap each 8-byte quadword; the Raw
         // sums are propagated through the quadword in three masked adds.
         ActiveMask.use = 0x0000ffffff000000;
         ShiftBpp.use = 24;    // == 3 * 8
         ShiftRem.use = 40;    // == 64 - 24
         _asm {
            mov edi, row
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub3lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive bytes
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
      }
      break;

      case 1:
      {
         // bpp == 1: pure serial dependency (every byte depends on the
         // previous one), so MMX buys nothing; use the byte loop for the
         // whole row.  Equivalent C (kept for reference, matches the
         // non-MMX SUB decode in png_read_filter_row):
         //
         // png_bytep rp;
         // png_bytep lp;
         // png_uint_32 i;
         // bpp = (row_info->pixel_depth + 7) >> 3;
         // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //      i < row_info->rowbytes; i++, rp++, lp++)
         // {
         //    *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         // }
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi          // lp = row
            xor eax, eax
            add edi, bpp          // rp = row + bpp
dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
dsub1end:
         } // end _asm block
      }
      return;        // whole row already done; skip MMX cleanup below

      case 6:
      case 7:
      case 4:
      case 5:
      {
         // 4 <= bpp <= 7: at most two pixels start inside one quadword,
         // so two shifted adds per 8 bytes suffice and no masks are
         // needed (the shifts clear the inactive bytes).
         ShiftBpp.use = bpp << 3;         // == bpp * 8
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub4lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
                                  // there is no need for any mask
                                  // since shift clears inactive bits/bytes
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;

      case 2:
      {
         // Four 2-byte pixels per quadword: propagate the sums with
         // three masked, shifted adds after the initial carry-in add.
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;    // == 2 * 8
         ShiftRem.use = 48;    // == 64 - 16
         _asm {
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  // byte group
            mov esi, edi          // lp = row
            movq mm5, mm6
            add edi, bpp          // rp = row + bpp
            psllq mm5, ShiftBpp   // Move mask in mm5 to cover 4th active
                                  // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive
                                  // bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm5         // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;

      case 8:
      {
         // bpp == 8: Raw(x-bpp) is exactly the previous quadword, so
         // each 8-byte group is one paddb; unrolled 8x (64 bytes per
         // iteration) with an 8-bytes-at-a-time tail loop.
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8] // PRIME the pump (load the first
                                  // Raw(x-bpp) data set
            and ecx, 0x0000003f   // calc bytes over mult of 64
                                  // NOTE(review): this leaves ecx =
                                  // MMXLength & 0x3f (the remainder), yet
                                  // ecx is used below as the bound of the
                                  // 64-byte unrolled loop.  The matching
                                  // prologue in the Up filter instead
                                  // subtracts the remainder from the
                                  // length -- verify this is intended.
dsub8lp:
            movq mm0, [edi+ebx]   // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0   // Write Raw(x) for 1st 8 bytes
            // Now mm0 will be used as Raw(x-bpp) for
            // the 2nd group of 8 bytes.  This will be
            // repeated for each group of 8 bytes with
            // the 8th group being used as the Raw(x-bpp)
            // for the 1st group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
            cmp ebx, MMXLength
            jnb dsub8lt8
dsub8lpA:
            // Tail: remaining multiple-of-8 bytes, one quadword at a time
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
            movq mm7, mm0         // Move calculated Raw(x) data to mm1 to
                                  // be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
dsub8lt8:
         } // end _asm block
      }
      break;

      default:                // bpp greater than 8 bytes
      {
         // Predecessor pixel lies in an earlier quadword (esi trails edi
         // by bpp bytes), so a straight quadword add works with no
         // shifting or masking.
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
                                  // add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;

   } // end switch ( bpp )

   // Byte-at-a-time cleanup for the tail the MMX loop left
   // (MMXLength .. FullLength-1).
   _asm {
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi               // lp = row
      xor eax, eax
      add edi, bpp               // rp = row + bpp
dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
dsubend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
// Optimized code for PNG Up filter decoder.
//
// Reconstructs one row filtered with the PNG "Up" filter:
//   Raw(x) = Up(x) + Prior(x)  (modulo 256, byte-wise)
// i.e. every byte of `row` gets the corresponding byte of `prev_row`
// added to it in place.
//
// Strategy (visible in the asm below):
//   1. advance byte-by-byte until `row` reaches an 8-byte boundary,
//   2. process 64 bytes per iteration using all eight MMX registers,
//      interleaved to reduce branches and partial stalls,
//   3. process any remaining multiple-of-8 span 8 bytes at a time,
//   4. finish the last <8 bytes byte-by-byte, then issue emms.
//
// NOTE(review): the alignment prologue aligns `row` (edi) only; loads
// from `prev_row` (esi) in the MMX loops may still be unaligned, which
// movq tolerates at some speed cost.
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;
   len  = row_info->rowbytes;       // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment (round edi up to a multiple of 8,
      // ecx = number of prologue bytes needed; ebx is the running offset)
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo                      // already aligned; skip prologue
      // fix alignment: scalar Raw(x) = Up(x) + Prior(x) until aligned
   duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al        // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
   dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx                 // subtract alignment fix
      and edx, 0x0000003f          // calc bytes over mult of 64
      sub ecx, edx                 // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   duploop:
      movq mm1, [esi+ebx]          // Prior(x) for bytes 0-7
      movq mm0, [edi+ebx]          // Up(x) for bytes 0-7
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1               // Raw(x) = Up(x) + Prior(x), bytes 0-7
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3               // bytes 8-15
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5               // bytes 16-23
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7               // bytes 24-31
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1               // bytes 32-39
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3               // bytes 40-47
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5               // bytes 48-55
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7               // bytes 56-63
      cmp ebx, ecx
      movq [edi+ebx-8], mm6        // (+56)movq does not affect flags;
                                   // -8 to offset add ebx
      jb duploop

      cmp edx, 0                   // Test for bytes over mult of 64
      jz dupend

      // 2 lines added by lcreeve@netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8                   // test for less than 8 bytes
      jb duplt8

      add ecx, edx
      and edx, 0x00000007          // calc bytes over mult of 8
      sub ecx, edx                 // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
   duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0        // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0                   // Test for bytes over mult of 8
      jz dupend
   duplt8:
      xor eax, eax
      add ecx, edx                 // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
   duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al        // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
   dupend:
      // Conversion of filtered row completed
      emms                         // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
3642 // Optimized png_read_filter_row routines
3643 void /* PRIVATE */
3644 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3645 row, png_bytep prev_row, int filter)
3646 {
3647 #ifdef PNG_DEBUG
3648 char filnm[10];
3649 #endif
3651 if (mmx_supported == 2) {
3652 /* this should have happened in png_init_mmx_flags() already */
3653 png_warning(png_ptr, "asm_flags may not have been initialized");
3654 png_mmx_support();
3655 }
3657 #ifdef PNG_DEBUG
3658 png_debug(1, "in png_read_filter_row\n");
3659 switch (filter)
3660 {
3661 case 0: sprintf(filnm, "none");
3662 break;
3663 case 1: sprintf(filnm, "sub-%s",
3664 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3665 break;
3666 case 2: sprintf(filnm, "up-%s",
3667 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3668 break;
3669 case 3: sprintf(filnm, "avg-%s",
3670 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3671 break;
3672 case 4: sprintf(filnm, "Paeth-%s",
3673 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3674 break;
3675 default: sprintf(filnm, "unknw");
3676 break;
3677 }
3678 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3679 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3680 (int)((row_info->pixel_depth + 7) >> 3));
3681 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3682 #endif /* PNG_DEBUG */
3684 switch (filter)
3685 {
3686 case PNG_FILTER_VALUE_NONE:
3687 break;
3689 case PNG_FILTER_VALUE_SUB:
3690 {
3691 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3692 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3693 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3694 {
3695 png_read_filter_row_mmx_sub(row_info, row);
3696 }
3697 else
3698 {
3699 png_uint_32 i;
3700 png_uint_32 istop = row_info->rowbytes;
3701 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3702 png_bytep rp = row + bpp;
3703 png_bytep lp = row;
3705 for (i = bpp; i < istop; i++)
3706 {
3707 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3708 rp++;
3709 }
3710 }
3711 break;
3712 }
3714 case PNG_FILTER_VALUE_UP:
3715 {
3716 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3717 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3718 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3719 {
3720 png_read_filter_row_mmx_up(row_info, row, prev_row);
3721 }
3722 else
3723 {
3724 png_uint_32 i;
3725 png_uint_32 istop = row_info->rowbytes;
3726 png_bytep rp = row;
3727 png_bytep pp = prev_row;
3729 for (i = 0; i < istop; ++i)
3730 {
3731 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3732 rp++;
3733 }
3734 }
3735 break;
3736 }
3738 case PNG_FILTER_VALUE_AVG:
3739 {
3740 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3741 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3742 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3743 {
3744 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3745 }
3746 else
3747 {
3748 png_uint_32 i;
3749 png_bytep rp = row;
3750 png_bytep pp = prev_row;
3751 png_bytep lp = row;
3752 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3753 png_uint_32 istop = row_info->rowbytes - bpp;
3755 for (i = 0; i < bpp; i++)
3756 {
3757 *rp = (png_byte)(((int)(*rp) +
3758 ((int)(*pp++) >> 1)) & 0xff);
3759 rp++;
3760 }
3762 for (i = 0; i < istop; i++)
3763 {
3764 *rp = (png_byte)(((int)(*rp) +
3765 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3766 rp++;
3767 }
3768 }
3769 break;
3770 }
3772 case PNG_FILTER_VALUE_PAETH:
3773 {
3774 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3775 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3776 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3777 {
3778 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3779 }
3780 else
3781 {
3782 png_uint_32 i;
3783 png_bytep rp = row;
3784 png_bytep pp = prev_row;
3785 png_bytep lp = row;
3786 png_bytep cp = prev_row;
3787 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3788 png_uint_32 istop=row_info->rowbytes - bpp;
3790 for (i = 0; i < bpp; i++)
3791 {
3792 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3793 rp++;
3794 }
3796 for (i = 0; i < istop; i++) // use leftover rp,pp
3797 {
3798 int a, b, c, pa, pb, pc, p;
3800 a = *lp++;
3801 b = *pp++;
3802 c = *cp++;
3804 p = b - c;
3805 pc = a - c;
3807 #ifdef PNG_USE_ABS
3808 pa = abs(p);
3809 pb = abs(pc);
3810 pc = abs(p + pc);
3811 #else
3812 pa = p < 0 ? -p : p;
3813 pb = pc < 0 ? -pc : pc;
3814 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3815 #endif
3817 /*
3818 if (pa <= pb && pa <= pc)
3819 p = a;
3820 else if (pb <= pc)
3821 p = b;
3822 else
3823 p = c;
3824 */
3826 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3828 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3829 rp++;
3830 }
3831 }
3832 break;
3833 }
3835 default:
3836 png_warning(png_ptr, "Ignoring bad row filter type");
3837 *row=0;
3838 break;
3839 }
3840 }
3842 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */