Changes of Revision 2
Changed: x264.changes
-------------------------------------------------------------------
-Sun Dec 18 15:53:34 UTC 2011 - cs@linux-administrator.com
+Sun Feb 26 10:37:21 UTC 2012 - cs@linux-administrator.com

-- re-import
+- re-import
Changed: x264.spec
# norootforbuild

-%define soname 119
-%define svn 20111122
+%define soname 120
+%define svn 20120126
%define realname libx264

Name: x264
Summary: A free h264/avc encoder - encoder binary
Version: 0.%{soname}svn%{svn}
-Release: 1.1
+Release: 3
License: GPLv2+
Group: Productivity/Multimedia/Video/Editors and Convertors
URL: http://developers.videolan.org/x264.html
%{_libdir}/libx264.so

%changelog
-* Fri Sep 23 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20111122
-* Thu Sep 8 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20110907
-* Thu Jun 23 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20110622
-* Sat May 28 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20110527
-* Sat Apr 23 2011 reddwarf@opensuse.org
-- remove build timestamp
-- remove execution permissions from library
-* Sat Feb 26 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20110225
-* Sun Jan 16 2011 Manfred.Tremmel@iiv.de
-- update to snapshot 20110115
-* Sun Oct 17 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20101016
-* Sun Oct 3 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20101002
-* Tue Jun 29 2010 ludwig.nussel@gmx.de
-- require pkg-config
-- link binary against shared library
-* Wed Jun 16 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100615
-* Tue May 18 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100517
-* Mon Apr 26 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100425
- now able to create Blue Ray compatible h.264 streams
-* Sat Apr 3 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100402
-* Sun Feb 28 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100227
-* Wed Feb 17 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100216
-* Sat Feb 6 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100205
-* Wed Jan 27 2010 Manfred.Tremmel@iiv.de
-- rebuild because of no submit with the last build
-* Sat Jan 23 2010 Manfred.Tremmel@iiv.de
-- update to snapshot 20100122
-* Sat Jan 2 2010 Manfred.Tremmel@iiv.de
-- update to svn 20100101
-* Tue Dec 15 2009 Manfred.Tremmel@iiv.de
-- added a patch to fix broken ffmpeg defaults instead of aborting
-* Fri Dec 11 2009 Manfred.Tremmel@iiv.de
-- update to svn 20091211
-* Mon Nov 23 2009 Manfred.Tremmel@iiv.de
-- update to svn 20091123
-* Tue Aug 25 2009 Manfred.Tremmel@iiv.de
-- updated to snapshot 20090624
-* Sat Jun 27 2009 Manfred.Tremmel@iiv.de
-- updated to snapshot 20090627
-* Sun May 10 2009 Manfred.Tremmel@iiv.de
-- updated to snapshot 20090510
-* Tue Mar 10 2009 Manfred.Tremmel@iiv.de
-- updated to snapshot 20090310
-* Sat Feb 7 2009 Manfred.Tremmel@iiv.de
-- updated to snapshot 20090206
-* Thu Nov 6 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20081105
-* Sat Oct 4 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20081004
-* Wed Sep 17 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20080917
-* Thu Aug 14 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20080814
-- recompile with new yasm version
-* Sat Aug 9 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20080809
-* Tue Jun 17 2008 Manfred.Tremmel@iiv.de
-- updated to snapshot 20080617
-- also included snapshot 20071225 lib for compatibility reasons
-* Wed Apr 30 2008 guru@unixtech.be
-- fixed file permissions, thanks to Christian Morales Vega <cmorve69@yahoo.es>
-* Mon Dec 31 2007 guru@unixtech.be
-- made Requires in main package require the exact version-release
-- fixed License tag according to 10.3 packaging policies
-- added Provides/Obsoletes for proper upgrading of x264=>libx264-devel
-* Wed Dec 26 2007 leon@links2linux.de
-- updated to snapshot 20071225
-- changed the spec according to the new library policy, thanks Detlef
-- changed the group according to SPC guidelines
-* Mon Nov 6 2006 detlef@links2linux.de
-- update to snapshot-20061031-2245
-* Wed Nov 1 2006 leon@links2linux.de
-- new release
-* Wed Apr 26 2006 leon@links2linux.de
-- updated to the newest tarball (to fix PPC compiling)
-- match the new library revision (libx264.so.46)
-* Tue Apr 18 2006 leon@links2linux.de
-- updated to the newest tarball
-- removed the syntax patch since it has been merged
-- remove yasm from BuildRequires on x86
-* Wed Mar 22 2006 henne@links2linux.de
-- introduce a build section <:)
-- full url for source
-* Sat Mar 18 2006 leon@links2linux.de
-- Initial release for packman.
Changed: x264-use-shared-library.patch
@@ -1,15 +1,21 @@
---- Makefile.orig 2011-05-27 22:45:04.000000000 +0200
-+++ Makefile 2011-05-28 15:18:29.883305471 +0200
-@@ -149,9 +149,10 @@
+--- Makefile.orig 2011-12-26 22:45:03.000000000 +0100
++++ Makefile 2011-12-27 20:03:46.070404383 +0100
+@@ -152,6 +152,7 @@
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ ln -s $(SONAME) libx264.so
+
+ ifneq ($(EXE),)
+ .PHONY: x264 checkasm
+@@ -159,8 +160,8 @@
+ checkasm: checkasm$(EXE)
+ endif
+
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
- $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
+x264$(EXE): .depend $(OBJCLI) $(SONAME)
+ $(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
- checkasm: tools/checkasm.o $(LIBX264)
- $(LD)$@ $+ $(LDFLAGS)
+ checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
Changed: x264-snapshot-20120126-2245.tar.bz2/Makefile
@@ -2,6 +2,11 @@ include config.mak +vpath %.c $(SRCPATH) +vpath %.h $(SRCPATH) +vpath %.S $(SRCPATH) +vpath %.asm $(SRCPATH) + all: default SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \ @@ -22,6 +27,8 @@ SRCSO = +OBJCHK = tools/checkasm.o + CONFIG := $(shell cat config.h) # GPL-only files @@ -79,16 +86,16 @@ ifeq ($(ARCH),X86_64) ARCH_X86 = yes -ASMSRC = $(X86SRC:-32.asm=-64.asm) +ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm ASFLAGS += -DARCH_X86_64 endif ifdef ARCH_X86 -ASFLAGS += -Icommon/x86/ +ASFLAGS += -I$(SRCPATH)/common/x86/ SRCS += common/x86/mc-c.c common/x86/predict-c.c OBJASM = $(ASMSRC:%.asm=%.o) $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm -checkasm: tools/checkasm-a.o +OBJCHK += tools/checkasm-a.o endif endif @@ -135,7 +142,7 @@ OBJSO = $(SRCSO:%.c=%.o) DEP = depend -.PHONY: all default fprofiled clean distclean install uninstall dox test testclean lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli +.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli default: $(DEP) @@ -144,17 +151,26 @@ lib-shared: $(SONAME) $(LIBX264): .depend $(OBJS) $(OBJASM) + rm -f $(LIBX264) $(AR)$@ $(OBJS) $(OBJASM) $(if $(RANLIB), $(RANLIB) $@) $(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO) $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS) +ifneq ($(EXE),) +.PHONY: x264 checkasm +x264: x264$(EXE) +checkasm: checkasm$(EXE) +endif + x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264) $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS) -checkasm: tools/checkasm.o $(LIBX264) - $(LD)$@ $+ $(LDFLAGS) +checkasm$(EXE): .depend $(OBJCHK) $(LIBX264) + $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS) + +$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend %.o: %.asm $(AS) $(ASFLAGS) -o $@ $< @@ -166,7 +182,7 @@ .depend: config.mak @rm -f .depend - @$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:%.c=%.o) $(DEPMM) 1>> .depend;) + @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;) config.mak: ./configure @@ -204,12 +220,11 @@ clean: rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS - rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o + rm -f checkasm checkasm.exe $(OBJCHK) rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock distclean: clean rm -f config.mak x264_config.h config.h config.log x264.pc x264.def - rm -rf test/ install-cli: cli install -d $(DESTDIR)$(bindir) @@ -219,7 +234,7 @@ install -d $(DESTDIR)$(includedir) install -d $(DESTDIR)$(libdir) install -d $(DESTDIR)$(libdir)/pkgconfig - install -m 644 x264.h $(DESTDIR)$(includedir) + install -m 644 $(SRCPATH)/x264.h $(DESTDIR)$(includedir) install -m 644 x264_config.h $(DESTDIR)$(includedir) install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/arm/predict-a.S
@@ -102,6 +102,21 @@ bx lr .endfunc +function x264_predict_4x4_dc_top_neon + mov r12, #FDEC_STRIDE + sub r1, r0, #FDEC_STRIDE + vld1.32 d1[], [r1,:32] + vpaddl.u8 d1, d1 + vpadd.u16 d1, d1, d1 + vrshr.u16 d1, d1, #2 + vdup.8 d1, d1[0] + vst1.32 d1[0], [r0,:32], r12 + vst1.32 d1[0], [r0,:32], r12 + vst1.32 d1[0], [r0,:32], r12 + vst1.32 d1[0], [r0,:32], r12 + bx lr +.endfunc + // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1 uhadd8 \a1, \a1, \c1 @@ -211,6 +226,202 @@ bx lr .endfunc +function x264_predict_8x8_v_neon + add r1, r1, #16 + mov r12, #FDEC_STRIDE + vld1.8 {d0}, [r1,:64] +.rept 8 + vst1.8 {d0}, [r0,:64], r12 +.endr + bx lr +.endfunc + +function x264_predict_8x8_ddl_neon + add r1, #16 + vld1.8 {d0, d1}, [r1,:128] + vmov.i8 q3, #0 + vrev64.8 d2, d1 + vext.8 q8, q3, q0, #15 + vext.8 q2, q0, q1, #1 + vhadd.u8 q8, q2 + mov r12, #FDEC_STRIDE + vrhadd.u8 q0, q8 + vext.8 d2, d0, d1, #1 + vext.8 d3, d0, d1, #2 + vst1.8 d2, [r0,:64], r12 + vext.8 d2, d0, d1, #3 + vst1.8 d3, [r0,:64], r12 + vext.8 d3, d0, d1, #4 + vst1.8 d2, [r0,:64], r12 + vext.8 d2, d0, d1, #5 + vst1.8 d3, [r0,:64], r12 + vext.8 d3, d0, d1, #6 + vst1.8 d2, [r0,:64], r12 + vext.8 d2, d0, d1, #7 + vst1.8 d3, [r0,:64], r12 + vst1.8 d2, [r0,:64], r12 + vst1.8 d1, [r0,:64], r12 + bx lr +.endfunc + +function x264_predict_8x8_ddr_neon + vld1.8 {d0-d3}, [r1,:128] + vext.8 q2, q0, q1, #7 + vext.8 q3, q0, q1, #9 + + vhadd.u8 q2, q2, q3 + vrhadd.u8 d0, d1, d4 + vrhadd.u8 d1, d2, d5 + + add r0, #7*FDEC_STRIDE + mov r12, #-1*FDEC_STRIDE + + vext.8 d2, d0, d1, #1 + vst1.8 {d0}, [r0,:64], r12 + vext.8 d4, d0, d1, #2 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d5, d0, d1, #3 + vst1.8 {d4}, [r0,:64], r12 + vext.8 d4, d0, d1, #4 + vst1.8 {d5}, [r0,:64], r12 + vext.8 d5, d0, d1, #5 + vst1.8 {d4}, [r0,:64], r12 + vext.8 d4, d0, d1, #6 + vst1.8 {d5}, [r0,:64], r12 + vext.8 d5, d0, d1, #7 + vst1.8 {d4}, [r0,:64], r12 + vst1.8 {d5}, [r0,:64], r12 + bx lr +.endfunc + +function x264_predict_8x8_vl_neon + add r1, #16 + mov r12, #FDEC_STRIDE + + vld1.8 {d0, d1}, [r1,:128] + vext.8 q1, q1, q0, #15 + vext.8 q2, q0, q2, #1 + + vrhadd.u8 q3, q0, q2 + + vhadd.u8 q1, q1, q2 + vrhadd.u8 q0, q0, q1 + + vext.8 d2, d0, d1, #1 + vst1.8 {d6}, [r0,:64], r12 + vext.8 d3, d6, d7, #1 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d0, d1, #2 + vst1.8 {d3}, [r0,:64], r12 + vext.8 d3, d6, d7, #2 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d0, d1, #3 + vst1.8 {d3}, [r0,:64], r12 + vext.8 d3, d6, d7, #3 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d0, d1, #4 + vst1.8 {d3}, [r0,:64], r12 + vst1.8 {d2}, [r0,:64], r12 + bx lr +.endfunc + +function x264_predict_8x8_vr_neon + add r1, #8 + mov r12, #FDEC_STRIDE + vld1.8 {d4,d5}, [r1,:64] + + vext.8 q1, q2, q2, #14 + vext.8 q0, q2, q2, #15 + + vhadd.u8 q3, q2, q1 + vrhadd.u8 q2, q2, q0 + vrhadd.u8 q0, q0, q3 + + vmov d2, d0 + + vst1.8 {d5}, [r0,:64], r12 + vuzp.8 d2, d0 + vst1.8 {d1}, [r0,:64], r12 + vext.8 d6, d0, d5, #7 + vext.8 d3, d2, d1, #7 + vst1.8 {d6}, [r0,:64], r12 + vst1.8 {d3}, [r0,:64], r12 + vext.8 d6, d0, d5, #6 + vext.8 d3, d2, d1, #6 + vst1.8 {d6}, [r0,:64], r12 + vst1.8 {d3}, [r0,:64], r12 + vext.8 d6, d0, d5, #5 + vext.8 d3, d2, d1, #5 + vst1.8 {d6}, [r0,:64], r12 + vst1.8 {d3}, [r0,:64], r12 + bx lr +.endfunc + +function x264_predict_8x8_hd_neon + mov r12, #FDEC_STRIDE + add r1, #7 + + vld1.8 {d2,d3}, [r1] + vext.8 q3, q1, q1, #1 + vext.8 q2, q1, q1, #2 + + vrhadd.u8 q8, q1, q3 + + vhadd.u8 q1, q2 + vrhadd.u8 q0, q1, q3 + + vzip.8 d16, d0 + + vext.8 d2, d0, d1, #6 + vext.8 
d3, d0, d1, #4 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d0, d1, #2 + vst1.8 {d3}, [r0,:64], r12 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d16, d0, #6 + vst1.8 {d0}, [r0,:64], r12 + vext.8 d3, d16, d0, #4 + vst1.8 {d2}, [r0,:64], r12 + vext.8 d2, d16, d0, #2 + vst1.8 {d3}, [r0,:64], r12 + vst1.8 {d2}, [r0,:64], r12 + vst1.8 {d16}, [r0,:64], r12 + + bx lr +.endfunc + +function x264_predict_8x8_hu_neon + mov r12, #FDEC_STRIDE + add r1, #7 + vld1.8 {d7}, [r1] + vdup.8 d6, d7[0] + vrev64.8 d7, d7 + + vext.8 d4, d7, d6, #2 + vext.8 d2, d7, d6, #1 + + vhadd.u8 d16, d7, d4 + vrhadd.u8 d0, d2, d7 + vrhadd.u8 d1, d16, d2 + + vzip.8 d0, d1 + + vdup.16 q1, d1[3] + + vext.8 q2, q0, q1, #2 + vext.8 q3, q0, q1, #4 + vext.8 q8, q0, q1, #6 + vst1.8 {d0}, [r0,:64], r12 + vst1.8 {d4}, [r0,:64], r12 + vst1.8 {d6}, [r0,:64], r12 + vst1.8 {d16}, [r0,:64], r12 + + vst1.8 {d1}, [r0,:64], r12 + vst1.8 {d5}, [r0,:64], r12 + vst1.8 {d7}, [r0,:64], r12 + vst1.8 {d17}, [r0,:64] + bx lr +.endfunc function x264_predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE @@ -223,7 +434,7 @@ vdup.8 d0, d0[0] vtrn.32 d0, d1 b pred8x8_dc_end - .endfunc +.endfunc function x264_predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/arm/predict-c.c
@@ -28,6 +28,7 @@ #include "pixel.h" void x264_predict_4x4_dc_armv6( uint8_t *src ); +void x264_predict_4x4_dc_top_neon( uint8_t *src ); void x264_predict_4x4_h_armv6( uint8_t *src ); void x264_predict_4x4_ddr_armv6( uint8_t *src ); void x264_predict_4x4_ddl_neon( uint8_t *src ); @@ -40,7 +41,14 @@ void x264_predict_8x8c_p_neon( uint8_t *src ); void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_16x16_dc_neon( uint8_t *src ); void x264_predict_16x16_dc_top_neon( uint8_t *src ); @@ -62,6 +70,7 @@ if (!(cpu&X264_CPU_NEON)) return; + pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; #endif // !HIGH_BIT_DEPTH } @@ -87,8 +96,15 @@ return; #if !HIGH_BIT_DEPTH + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; + pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon; + pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon; + pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon; #endif // !HIGH_BIT_DEPTH } | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/bitstream.h
@@ -56,6 +56,7 @@
typedef struct
{
int last;
+ int mask;
dctcoef level[16];
uint8_t run[16];
} x264_run_level_t;
@@ -65,7 +66,6 @@
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-extern const vlc_t x264_run_before[7][16];

typedef struct
{
@@ -82,6 +82,11 @@
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];

+/* The longest possible set of zero run codes sums to 25 bits. This leaves
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
+
+extern uint32_t x264_run_before[1<<16];
+
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & 3);
Changed: x264-snapshot-20120126-2245.tar.bz2/common/common.h
@@ -236,7 +236,7 @@
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );

static ALWAYS_INLINE pixel x264_clip_pixel( int x )
@@ -498,6 +498,8 @@
udctcoef (*quant8_mf[4])[64];   /* [4][52][64] */
udctcoef (*quant4_bias[4])[16]; /* [4][52][16] */
udctcoef (*quant8_bias[4])[64]; /* [4][52][64] */
+ udctcoef (*quant4_bias0[4])[16]; /* [4][52][16] */
+ udctcoef (*quant8_bias0[4])[64]; /* [4][52][64] */
udctcoef (*nr_offset_emergency)[4][64];

/* mv/ref cost arrays. */
Changed: x264-snapshot-20120126-2245.tar.bz2/common/dct.c
@@ -36,8 +36,69 @@ # include "arm/dct.h" #endif -uint16_t x264_dct4_weight2_zigzag[2][16]; -uint16_t x264_dct8_weight2_zigzag[2][64]; +/* the inverse of the scaling factors introduced by 8x8 fdct */ +/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ +#define W(i) (i==0 ? FIX8(1.0000) :\ + i==1 ? FIX8(0.8859) :\ + i==2 ? FIX8(1.6000) :\ + i==3 ? FIX8(0.9415) :\ + i==4 ? FIX8(1.2651) :\ + i==5 ? FIX8(1.1910) :0) +const uint32_t x264_dct8_weight_tab[64] = { + W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + + W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) +}; +#undef W + +#define W(i) (i==0 ? FIX8(1.76777) :\ + i==1 ? FIX8(1.11803) :\ + i==2 ? FIX8(0.70711) :0) +const uint32_t x264_dct4_weight_tab[16] = { + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2), + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2) +}; +#undef W + +/* inverse squared */ +#define W(i) (i==0 ? FIX8(3.125) :\ + i==1 ? FIX8(1.25) :\ + i==2 ? FIX8(0.5) :0) +const uint32_t x264_dct4_weight2_tab[16] = { + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2), + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2) +}; +#undef W + +#define W(i) (i==0 ? FIX8(1.00000) :\ + i==1 ? FIX8(0.78487) :\ + i==2 ? FIX8(2.56132) :\ + i==3 ? FIX8(0.88637) :\ + i==4 ? FIX8(1.60040) :\ + i==5 ? FIX8(1.41850) :0) +const uint32_t x264_dct8_weight2_tab[64] = { + W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + + W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), + W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), + W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) +}; +#undef W + static void dct4x4dc( dctcoef d[16] ) { @@ -509,19 +570,35 @@ dctf->add4x4_idct = x264_add4x4_idct_sse2; dctf->dct4x4dc = x264_dct4x4dc_sse2; dctf->idct4x4dc = x264_idct4x4dc_sse2; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; + dctf->add16x16_idct8 = x264_add16x16_idct8_sse2; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2; } + if( cpu&X264_CPU_SSE4 ) + { + dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4; + } if( cpu&X264_CPU_AVX ) { dctf->add4x4_idct = x264_add4x4_idct_avx; dctf->dct4x4dc = x264_dct4x4dc_avx; dctf->idct4x4dc = x264_idct4x4dc_avx; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; dctf->add8x8_idct = x264_add8x8_idct_avx; dctf->add16x16_idct = x264_add16x16_idct_avx; + dctf->add8x8_idct8 = x264_add8x8_idct8_avx; + dctf->add16x16_idct8 = x264_add16x16_idct8_avx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx; } #endif // HAVE_MMX @@ -555,6 +632,7 @@ dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; dctf->sub16x16_dct8 = 
x264_sub16x16_dct8_sse2; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; + dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2; dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8= x264_add16x16_idct8_sse2; @@ -572,6 +650,7 @@ dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; } @@ -592,6 +671,12 @@ dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; } + + if( cpu&X264_CPU_XOP ) + { + dctf->sub8x8_dct = x264_sub8x8_dct_xop; + dctf->sub16x16_dct = x264_sub16x16_dct_xop; + } #endif //HAVE_MMX #if HAVE_ALTIVEC @@ -639,17 +724,6 @@ #endif // HIGH_BIT_DEPTH } -void x264_dct_init_weights( void ) -{ - for( int j = 0; j < 2; j++ ) - { - for( int i = 0; i < 16; i++ ) - x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ]; - for( int i = 0; i < 64; i++ ) - x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ]; - } -} - #define ZIG(i,y,x) level[i] = dct[x*8+y]; #define ZIGZAG8_FRAME\ | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/dct.h
@@ -26,70 +26,10 @@ #ifndef X264_DCT_H #define X264_DCT_H -/* the inverse of the scaling factors introduced by 8x8 fdct */ -#define W(i) (i==0 ? FIX8(1.0000) :\ - i==1 ? FIX8(0.8859) :\ - i==2 ? FIX8(1.6000) :\ - i==3 ? FIX8(0.9415) :\ - i==4 ? FIX8(1.2651) :\ - i==5 ? FIX8(1.1910) :0) -static const uint16_t x264_dct8_weight_tab[64] = { - W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - - W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) -}; -#undef W - -#define W(i) (i==0 ? FIX8(1.76777) :\ - i==1 ? FIX8(1.11803) :\ - i==2 ? FIX8(0.70711) :0) -static const uint16_t x264_dct4_weight_tab[16] = { - W(0), W(1), W(0), W(1), - W(1), W(2), W(1), W(2), - W(0), W(1), W(0), W(1), - W(1), W(2), W(1), W(2) -}; -#undef W - -/* inverse squared */ -#define W(i) (i==0 ? FIX8(3.125) :\ - i==1 ? FIX8(1.25) :\ - i==2 ? FIX8(0.5) :0) -static const uint16_t x264_dct4_weight2_tab[16] = { - W(0), W(1), W(0), W(1), - W(1), W(2), W(1), W(2), - W(0), W(1), W(0), W(1), - W(1), W(2), W(1), W(2) -}; -#undef W - -#define W(i) (i==0 ? FIX8(1.00000) :\ - i==1 ? FIX8(0.78487) :\ - i==2 ? FIX8(2.56132) :\ - i==3 ? FIX8(0.88637) :\ - i==4 ? FIX8(1.60040) :\ - i==5 ? FIX8(1.41850) :0) -static const uint16_t x264_dct8_weight2_tab[64] = { - W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - - W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), - W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), - W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) -}; -#undef W - -extern uint16_t x264_dct4_weight2_zigzag[2][16]; // [2] = {frame, field} -extern uint16_t x264_dct8_weight2_zigzag[2][64]; +extern const uint32_t x264_dct4_weight_tab[16]; +extern const uint32_t x264_dct8_weight_tab[64]; +extern const uint32_t x264_dct4_weight2_tab[16]; +extern const uint32_t x264_dct8_weight2_tab[64]; typedef struct { | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/deblock.c
@@ -165,13 +165,7 @@ } static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - for( int i = 0; i < 4; i++, pix += stride ) - deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] ); -} -static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) -{ - for( int i = 0; i < 8; i++, pix += stride ) - deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] ); + deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 ); } static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { @@ -265,13 +259,7 @@ } static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) { - for( int i = 0; i < 4; i++, pix += stride ) - deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); -} -static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) -{ - for( int i = 0; i < 8; i++, pix += stride ) - deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); + deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta ); } static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) { @@ -474,13 +462,15 @@ { deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock ); deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); - deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); + if( chroma444 ) + deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); } else { deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock ); deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); - deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); + if( chroma444 ) + deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); } int offy = MB_INTERLACED ? 
4 : 0; @@ -492,13 +482,15 @@ { deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_intra_deblock ); deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock ); - deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock ); + if( chroma444 ) + deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock ); } else { deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_deblock ); deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock ); - deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock ); + if( chroma444 ) + deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock ); } } else @@ -647,6 +639,8 @@ void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); @@ -658,6 +652,9 @@ void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); @@ -670,15 +667,21 @@ void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); + +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta ); #if ARCH_X86 void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); void 
x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); #if HIGH_BIT_DEPTH void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); @@ -725,10 +728,8 @@ pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c; pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c; pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c; - pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c; pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c; pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c; - pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c; pf->deblock_strength = deblock_strength_c; #if HAVE_MMX @@ -739,22 +740,26 @@ pf->deblock_luma[0] = x264_deblock_h_luma_mmx2; pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; -#if !HIGH_BIT_DEPTH + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2; -#endif + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; +#endif +#if !HIGH_BIT_DEPTH + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; -#if !HIGH_BIT_DEPTH pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2; -#endif + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2; + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_luma[1] = x264_deblock_v_luma_sse2; @@ -764,6 +769,9 @@ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; +#if HIGH_BIT_DEPTH + pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2; +#endif } } if( cpu&X264_CPU_SSSE3 ) @@ -772,9 +780,8 @@ { pf->deblock_strength = x264_deblock_strength_avx; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; -#if !HIGH_BIT_DEPTH pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx; -#endif + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_luma[1] = x264_deblock_v_luma_avx; @@ -784,6 +791,10 @@ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; +#if HIGH_BIT_DEPTH + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx; +#endif } } } @@ -808,4 +819,8 @@ } #endif #endif // !HIGH_BIT_DEPTH 
+ + /* These functions are equivalent, so don't duplicate them. */ + pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420; + pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra; } | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/frame.c
@@ -353,6 +353,7 @@
dst->param = src->param;
dst->i_pic_struct = src->i_pic_struct;
dst->extra_sei = src->extra_sei;
+ dst->opaque = src->opaque;

uint8_t *pix[3];
int stride[3];
Changed: x264-snapshot-20120126-2245.tar.bz2/common/frame.h
@@ -162,6 +162,9 @@
/* user sei */
x264_sei_t extra_sei;
+
+ /* user data */
+ void *opaque;
} x264_frame_t;

/* synchronized frame list */
Changed: x264-snapshot-20120126-2245.tar.bz2/common/mc.c
@@ -304,9 +304,9 @@
}
}

-void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
+ pixel *dstv, int i_dstv,
+ pixel *src, int i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int x=0; x<w; x++ )
@@ -316,10 +316,10 @@
}
}

-void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
- pixel *dstb, int i_dstb,
- pixel *dstc, int i_dstc,
- pixel *src, int i_src, int pw, int w, int h )
+static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
+ pixel *dstb, int i_dstb,
+ pixel *dstc, int i_dstc,
+ pixel *src, int i_src, int pw, int w, int h )
{
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
{
Changed: x264-snapshot-20120126-2245.tar.bz2/common/osdep.h
@@ -31,15 +31,10 @@
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/stat.h>
+#include <inttypes.h>
#include "config.h"

-#if HAVE_STDINT_H
-#include <stdint.h>
-#else
-#include <inttypes.h>
-#endif
-
#if !HAVE_LOG2F
#define log2f(x) (logf(x)/0.693147180559945f)
#define log2(x) (log(x)/0.693147180559945)
Changed: x264-snapshot-20120126-2245.tar.bz2/common/pixel.c
@@ -522,8 +522,6 @@ INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX INTRA_MBCMP_8x8( sad, _mmx2, _c ) -INTRA_MBCMP_8x8( sad, _sse2, _sse2 ) -INTRA_MBCMP_8x8( sad, _ssse3, _sse2 ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif @@ -550,14 +548,10 @@ #if HAVE_MMX #if HIGH_BIT_DEPTH INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) -INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c ) -INTRA_MBCMP(satd, 8x8, dc, h, v, c, _mmx2, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) -INTRA_MBCMP(satd, 16x16, v, h, dc, , _mmx2, _mmx2 ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 ) -INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 ) #else @@ -865,6 +859,7 @@ #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; @@ -872,6 +867,7 @@ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { @@ -884,7 +880,7 @@ { INIT4( hadamard_ac, _sse2 ); } - + pixf->vsad = x264_pixel_vsad_sse2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; @@ -911,7 +907,8 @@ { INIT4( hadamard_ac, _ssse3 ); } - + pixf->vsad = x264_pixel_vsad_ssse3; + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; @@ -935,6 +932,7 @@ { INIT4( hadamard_ac, _avx ); } + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; @@ -943,6 +941,10 @@ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; } + if( cpu&X264_CPU_XOP ) + { + pixf->vsad = x264_pixel_vsad_xop; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/quant.c
@@ -373,14 +373,17 @@ {\ int i_last = runlevel->last = x264_coeff_last##num(dct);\ int i_total = 0;\ + int mask = 0;\ do\ {\ int r = 0;\ runlevel->level[i_total] = dct[i_last];\ + mask |= 1 << (i_last);\ while( --i_last >= 0 && dct[i_last] == 0 )\ r++;\ runlevel->run[i_total++] = r;\ } while( i_last >= 0 );\ + runlevel->mask = mask;\ return i_total;\ } @@ -389,6 +392,18 @@ level_run(15) level_run(16) +#if ARCH_X86_64 +#define INIT_TRELLIS(cpu)\ + pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\ + pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\ + pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\ + pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\ + pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\ + pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu; +#else +#define INIT_TRELLIS(...) +#endif + void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -423,6 +438,7 @@ #if HIGH_BIT_DEPTH #if HAVE_MMX + INIT_TRELLIS( sse2 ); if( cpu&X264_CPU_MMX2 ) { #if ARCH_X86 @@ -500,6 +516,7 @@ pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; } pf->decimate_score64 = x264_decimate_score64_ssse3; + INIT_TRELLIS( ssse3 ); } if( cpu&X264_CPU_SSE4 ) { @@ -524,6 +541,7 @@ #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX + INIT_TRELLIS( sse2 ); if( cpu&X264_CPU_MMX ) { #if ARCH_X86 @@ -627,6 +645,7 @@ pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; } pf->decimate_score64 = x264_decimate_score64_ssse3; + INIT_TRELLIS( ssse3 ); } if( cpu&X264_CPU_SSE4 ) | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/quant.h
@@ -55,6 +55,17 @@ int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel ); int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel ); int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel ); + +#define TRELLIS_PARAMS const int *unquant_mf, const uint8_t *zigzag, int lambda2,\ + int last_nnz, dctcoef *coefs, dctcoef *quant_coefs, dctcoef *dct,\ + uint8_t *cabac_state_sig, uint8_t *cabac_state_last,\ + uint64_t level_state0, uint16_t level_state1 + int (*trellis_cabac_4x4)( TRELLIS_PARAMS, int b_ac ); + int (*trellis_cabac_8x8)( TRELLIS_PARAMS, int b_interlaced ); + int (*trellis_cabac_4x4_psy)( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int psy_trellis ); + int (*trellis_cabac_8x8_psy)( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int psy_trellis ); + int (*trellis_cabac_dc)( TRELLIS_PARAMS, int num_coefs ); + int (*trellis_cabac_chroma_422_dc)( TRELLIS_PARAMS ); } x264_quant_function_t; void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/rectangle.c
@@ -26,7 +26,7 @@
#include "common.h"

#define CACHE_FUNC(name,size,width,height)\
-void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
+static void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
{\
x264_macroblock_cache_rect( target, width*size, height, size, val );\
}
Changed: x264-snapshot-20120126-2245.tar.bz2/common/set.c
@@ -112,9 +112,15 @@ !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) break; if( j < i ) + { h->quant4_bias[i] = h->quant4_bias[j]; + h->quant4_bias0[i] = h->quant4_bias0[j]; + } else + { CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) ); + CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) ); + } } for( int q = 0; q < 6; q++ ) @@ -163,6 +169,7 @@ } // round to nearest, unless that would cause the deadzone to be negative h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); + h->quant4_bias0[i_list][q][i] = (1<<15)/j; if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) ) max_qp_err = q; if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) ) @@ -182,6 +189,7 @@ continue; } h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); + h->quant8_bias0[i_list][q][i] = (1<<15)/j; if( j > 0xffff && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) ) max_qp_err = q; if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) ) @@ -272,7 +280,10 @@ if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\ break;\ if( j == i )\ + {\ x264_free( h->quant##n##_bias[i] );\ + x264_free( h->quant##n##_bias0[i] );\ + }\ } void x264_cqm_delete( x264_t *h ) @@ -351,8 +362,8 @@ b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_LUMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 ); if( CHROMA444 ) { - b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8iy, x264_cqm_jvt8i, 64 ); - b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 ); + b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8ic, x264_cqm_jvt8i, 64 ); + b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8pc, x264_cqm_jvt8p, 64 ); } x264_free( buf ); | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/vlc.c
@@ -738,7 +738,7 @@
};

/* [MIN( i_zero_left-1, 6 )][run_before] */
-const vlc_t x264_run_before[7][16] =
+static const vlc_t run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x1, 1 }, /* str=1 */
@@ -799,8 +799,9 @@
};

vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+uint32_t x264_run_before[1<<16];

-void x264_cavlc_init( void )
+void x264_cavlc_init( x264_t *h )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
@@ -840,4 +841,27 @@
i_next++;
vlc->i_next = i_next;
}
+
+ for( int i = 1; i < (1<<16); i++ )
+ {
+ x264_run_level_t runlevel;
+ ALIGNED_ARRAY_16( dctcoef, dct, [16] );
+ int size = 0;
+ int bits = 0;
+ for( int j = 0; j < 16; j++ )
+ dct[j] = i&(1<<j);
+ int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
+ int zeros = runlevel.last + 1 - total;
+ for( int j = 0; j < total-1 && zeros > 0; j++ )
+ {
+ int idx = X264_MIN(zeros, 7) - 1;
+ int run = runlevel.run[j];
+ int len = run_before[idx][run].i_size;
+ size += len;
+ bits <<= len;
+ bits |= run_before[idx][run].i_bits;
+ zeros -= run;
+ }
+ x264_run_before[i] = (bits << 5) + size;
+ }
}
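For context, the x264_cavlc_init() loop above packs each precomputed run_before sequence as (bits << 5) + size, matching the bitstream.h comment that up to 25 bits of code plus a 5-bit length fit in a uint32_t. Below is a minimal C sketch of how such a packed entry could be consumed when writing CAVLC residual data; the helper name is hypothetical and the bs_write() call is assumed to match x264's bitstream writer, so this is illustrative only and not taken from the changeset.

/* Illustrative sketch: emit the packed run_before codes for one 4x4 block.
 * coeff_mask has bit j set when dct[j] is nonzero (the runlevel->mask value
 * that coeff_level_run() now records). */
static inline void write_run_before_codes( bs_t *s, uint16_t coeff_mask )
{
    uint32_t packed = x264_run_before[coeff_mask];
    int      size   = packed & 0x1f; /* low 5 bits: total length of the concatenated codes */
    uint32_t code   = packed >> 5;   /* upper bits: the concatenated run_before VLC codes  */
    if( size )
        bs_write( s, size, code );   /* assumed x264 bitstream-writer call */
}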
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/cabac-a.asm
@@ -35,7 +35,7 @@ ; t3 must be ecx, since it's used for shift. %ifdef WIN64 - DECLARE_REG_TMP 3,1,2,0,4,5,6,2 + DECLARE_REG_TMP 3,1,2,0,6,5,4,2 %define pointer resq %elifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6,6 @@ -61,11 +61,11 @@ %macro LOAD_GLOBAL 4 %ifdef PIC ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea - lea r11, [%2] + lea r7, [%2] %ifnidn %3, 0 - add r11, %3 + add r7, %3 %endif - movzx %1, byte [r11+%4] + movzx %1, byte [r7+%4] %else movzx %1, byte [%2+%3+%4] %endif @@ -81,6 +81,9 @@ and t4d, t6d shr t5d, 6 movifnidn t2d, r2m +%ifdef WIN64 + PUSH r7 +%endif LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 and t6d, 1 @@ -95,6 +98,9 @@ mov t4d, t3d shr t3d, 3 LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 +%ifdef WIN64 + POP r7 +%endif shl t4d, t3b shl t6d, t3b mov [t0+cb.range], t4d @@ -144,12 +150,11 @@ PROLOGUE 0,7 mov t3d, [t0+cb.queue] mov t6d, [t0+cb.low] - jmp cabac_putbyte cabac_putbyte: ; alive: t0=cb t3=queue t6=low %ifdef WIN64 - DECLARE_REG_TMP 3,4,1,0,2,5,6,10 + DECLARE_REG_TMP 3,6,1,0,2,5,4 %endif mov t1d, -1 add t3d, 10 | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/const-a.asm
@@ -38,6 +38,7 @@
const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
+const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 8 dw 16
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-32.asm
@@ -32,120 +32,13 @@ SECTION .text -%ifndef HIGH_BIT_DEPTH +cextern pd_32 +cextern pw_pixel_max +cextern pw_2 +cextern pw_m2 cextern pw_32 cextern hsub_mul -; in: m0..m7 -; out: 0,4,6 in mem, rest in regs -%macro DCT8_1D 9 - SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07 - SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16 - SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25 - SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34 - SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2 - SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3 - SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4 - mova [%9+0x00], m%6 - mova [%9+0x40], m%5 - psraw m%6, m%7, 1 ; a3>>1 - paddw m%6, m%8 ; a2 + (a3>>1) - psraw m%8, 1 ; a2>>1 - psubw m%8, m%7 ; (a2>>1) - a3 - mova [%9+0x60], m%8 - psraw m%5, m%3, 1 - paddw m%5, m%3 ; d25+(d25>>1) - psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1)) - psubw m%7, m%5 - psraw m%5, m%2, 1 - paddw m%5, m%2 ; d16+(d16>>1) - paddw m%8, m%1, m%4 - psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1)) - psraw m%5, m%1, 1 - paddw m%5, m%1 ; d07+(d07>>1) - paddw m%5, m%2 - paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1)) - psraw m%1, m%4, 1 - paddw m%1, m%4 ; d34+(d34>>1) - paddw m%1, m%2 - psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1)) - psraw m%4, m%1, 2 - paddw m%4, m%5 ; a4 + (a7>>2) - psraw m%3, m%8, 2 - paddw m%3, m%7 ; a5 + (a6>>2) - psraw m%5, 2 - psraw m%7, 2 - psubw m%5, m%1 ; (a4>>2) - a7 - psubw m%8, m%7 ; a6 - (a5>>2) - SWAP %2, %4, %3, %6, %8, %5 -%endmacro - -; in: 0,4 in mem, rest in regs -; out: m0..m7 -%macro IDCT8_1D 9 - psraw m%1, m%3, 1 - psraw m%5, m%7, 1 - psubw m%1, m%7 - paddw m%5, m%3 - psraw m%7, m%2, 1 - paddw m%7, m%2 - paddw m%7, m%4 - paddw m%7, m%6 - psraw m%3, m%6, 1 - paddw m%3, m%6 - paddw m%3, m%8 - psubw m%3, m%2 - psubw m%2, m%4 - psubw m%6, m%4 - paddw m%2, m%8 - psubw m%6, m%8 - psraw m%4, 1 - psraw m%8, 1 - psubw m%2, m%4 - psubw m%6, m%8 - psraw m%4, m%7, 2 - psraw m%8, m%3, 2 - paddw m%4, m%6 - paddw m%8, m%2 - psraw m%6, 2 - psraw m%2, 2 - psubw m%7, m%6 - psubw m%2, m%3 - mova m%3, [%9+0x00] - mova m%6, [%9+0x40] - SUMSUB_BA w, %6, %3 - SUMSUB_BA w, %5, %6 - SUMSUB_BA w, %1, %3 - SUMSUB_BA w, %7, %5 - SUMSUB_BA w, %2, %1 - SUMSUB_BA w, %8, %3 - SUMSUB_BA w, %4, %6 - SWAP %1, %3 - SWAP %5, %7 - SWAP %1, %5, %6 - SWAP %3, %8, %7 -%endmacro - -INIT_MMX -ALIGN 16 -load_diff_4x8_mmx: - LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] - LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] - LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] - movq [r0], m0 - LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] - LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] - movq m0, [r0] - ret - -cglobal dct8_mmx - DCT8_1D 0,1,2,3,4,5,6,7,r0 - SAVE_MM_PERMUTATION - ret - %macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets %xdefine %%base %1 %rep %0/2 @@ -174,6 +67,227 @@ UNSPILL_SHUFFLE %1, %2, %2 %endmacro +; in: size, m0..m7 +; out: 0,4,6 in memory at %10,%11,%12, rest in regs +%macro DCT8_1D 12 + SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07 + SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16 + SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25 + SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34 + SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2 + SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3 + SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4 + mova %10, m%7 + mova %11, m%6 + psra%1 m%7, m%8, 1 ; a3>>1 + padd%1 m%7, m%9 
; a2 + (a3>>1) + psra%1 m%9, 1 ; a2>>1 + psub%1 m%9, m%8 ; (a2>>1) - a3 + mova %12, m%9 + psra%1 m%6, m%4, 1 + padd%1 m%6, m%4 ; d25+(d25>>1) + psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1)) + psub%1 m%8, m%6 + psra%1 m%6, m%3, 1 + padd%1 m%6, m%3 ; d16+(d16>>1) + padd%1 m%9, m%2, m%5 + psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1)) + psra%1 m%6, m%2, 1 + padd%1 m%6, m%2 ; d07+(d07>>1) + padd%1 m%6, m%3 + padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1)) + psra%1 m%2, m%5, 1 + padd%1 m%2, m%5 ; d34+(d34>>1) + padd%1 m%2, m%3 + psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1)) + psra%1 m%5, m%2, 2 + padd%1 m%5, m%6 ; a4 + (a7>>2) + psra%1 m%4, m%9, 2 + padd%1 m%4, m%8 ; a5 + (a6>>2) + psra%1 m%6, 2 + psra%1 m%8, 2 + psub%1 m%6, m%2 ; (a4>>2) - a7 + psub%1 m%9, m%8 ; a6 - (a5>>2) + SWAP %3, %5, %4, %7, %9, %6 +%endmacro + +; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11 +; out: m0..m7 +%macro IDCT8_1D 11 + psra%1 m%2, m%4, 1 + psra%1 m%6, m%8, 1 + psub%1 m%2, m%8 + padd%1 m%6, m%4 + psra%1 m%8, m%3, 1 + padd%1 m%8, m%3 + padd%1 m%8, m%5 + padd%1 m%8, m%7 + psra%1 m%4, m%7, 1 + padd%1 m%4, m%7 + padd%1 m%4, m%9 + psub%1 m%4, m%3 + psub%1 m%3, m%5 + psub%1 m%7, m%5 + padd%1 m%3, m%9 + psub%1 m%7, m%9 + psra%1 m%5, 1 + psra%1 m%9, 1 + psub%1 m%3, m%5 + psub%1 m%7, m%9 + psra%1 m%5, m%8, 2 + psra%1 m%9, m%4, 2 + padd%1 m%5, m%7 + padd%1 m%9, m%3 + psra%1 m%7, 2 + psra%1 m%3, 2 + psub%1 m%8, m%7 + psub%1 m%3, m%4 + mova m%4, %10 + mova m%7, %11 + SUMSUB_BA %1, %7, %4 + SUMSUB_BA %1, %6, %7 + SUMSUB_BA %1, %2, %4 + SUMSUB_BA %1, %8, %6 + SUMSUB_BA %1, %3, %2 + SUMSUB_BA %1, %9, %4 + SUMSUB_BA %1, %5, %7 + SWAP %2, %4 + SWAP %6, %8 + SWAP %2, %6, %7 + SWAP %4, %9, %8 +%endmacro + +%ifdef HIGH_BIT_DEPTH + +%macro SUB8x8_DCT8 0 +cglobal sub8x8_dct8, 3,3,8 +global current_function %+ .skip_prologue +.skip_prologue: + LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2 + LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2 + + DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50] + mova m0, [r0] + + mova [r0+0x30], m5 + mova [r0+0x70], m7 + TRANSPOSE4x4W 0,1,2,3,4 + WIDEN_SXWD 0,4 + WIDEN_SXWD 1,5 + WIDEN_SXWD 2,6 + WIDEN_SXWD 3,7 + DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0] + mova [r0+0x20], m4 + mova [r0+0x40], m1 + mova [r0+0x60], m5 + mova [r0+0xA0], m6 + mova [r0+0xE0], m7 + mova m4, [r0+0x10] + mova m5, [r0+0x30] + mova m6, [r0+0x50] + mova m7, [r0+0x70] + + TRANSPOSE4x4W 4,5,6,7,0 + WIDEN_SXWD 4,0 + WIDEN_SXWD 5,1 + WIDEN_SXWD 6,2 + WIDEN_SXWD 7,3 + DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0] + mova [r0+0x30], m0 + mova [r0+0x50], m5 + mova [r0+0x70], m1 + mova [r0+0xB0], m2 + mova [r0+0xF0], m3 + ret +%endmacro ; SUB8x8_DCT8 + +INIT_XMM sse2 +SUB8x8_DCT8 +INIT_XMM sse4 +SUB8x8_DCT8 +INIT_XMM avx +SUB8x8_DCT8 + +%macro ADD8x8_IDCT8 0 +cglobal add8x8_idct8, 2,2 + add r1, 128 +global current_function %+ .skip_prologue +.skip_prologue: + UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6 + IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0] + mova [r1+0], m4 + TRANSPOSE4x4D 0,1,2,3,4 + paddd m0, [pd_32] + mova m4, [r1+0] + SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2 + TRANSPOSE4x4D 4,5,6,7,3 + paddd m4, [pd_32] + SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6 + UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7 + IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16] + mova [r1+16], m4 + TRANSPOSE4x4D 0,1,2,3,4 + mova m4, [r1+16] + mova [r1-112], m0 + TRANSPOSE4x4D 4,5,6,7,0 + SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7 + UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2 + IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112] + SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, 
-8,-7,-6,-5,-4,-3,-2,-1 + UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7 + IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16] + SPILL_SHUFFLE r1, 7,6,5, 7,6,5 + mova m7, [pw_pixel_max] + pxor m6, m6 + mova m5, [r1-128] + STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB] + mova m0, [r1-112] + STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB] + mova m0, [r1-96] + STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB] + mova m0, [r1-80] + STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB] + mova m0, [r1-64] + STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB] + mova m0, [r1-48] + mova m1, [r1+80] + STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB] + mova m0, [r1-32] + mova m1, [r1+96] + STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB] + mova m0, [r1-16] + mova m1, [r1+112] + STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB] + RET +%endmacro ; ADD8x8_IDCT8 + +INIT_XMM sse2 +ADD8x8_IDCT8 +INIT_XMM avx +ADD8x8_IDCT8 + +%else ; !HIGH_BIT_DEPTH + +INIT_MMX +ALIGN 16 +load_diff_4x8_mmx: + LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] + LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] + LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] + LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] + LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] + LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] + movq [r0], m0 + LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] + LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] + movq m0, [r0] + ret + +cglobal dct8_mmx + DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] + SAVE_MM_PERMUTATION + ret + ;----------------------------------------------------------------------------- ; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- @@ -223,7 +337,7 @@ ret cglobal idct8_mmx - IDCT8_1D 0,1,2,3,4,5,6,7,r1 + IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SAVE_MM_PERMUTATION ret @@ -383,11 +497,11 @@ LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE] UNSPILL r0, 0 %endif - DCT8_1D 0,1,2,3,4,5,6,7,r0 + DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] UNSPILL r0, 0,4 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1 UNSPILL r0, 4 - DCT8_1D 0,1,2,3,4,5,6,7,r0 + DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] SPILL r0, 1,2,3,5,7 ret %endmacro @@ -402,6 +516,8 @@ DCT_SUB8 INIT_XMM avx DCT_SUB8 +INIT_XMM xop +DCT_SUB8 ;----------------------------------------------------------------------------- ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) @@ -456,12 +572,12 @@ global current_function %+ .skip_prologue .skip_prologue: UNSPILL r1, 1,2,3,5,6,7 - IDCT8_1D 0,1,2,3,4,5,6,7,r1 + IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SPILL r1, 6 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1 paddw m0, [pw_32] SPILL r1, 0 - IDCT8_1D 0,1,2,3,4,5,6,7,r1 + IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SPILL r1, 6,7 pxor m7, m7 DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5 | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-64.asm
@@ -31,109 +31,231 @@ SECTION .text -%ifndef HIGH_BIT_DEPTH +cextern pd_32 +cextern pw_pixel_max +cextern pw_2 +cextern pw_m2 cextern pw_32 cextern hsub_mul -%macro DCT8_1D 10 - SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34 - SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25 - SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16 - SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07 - - SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3 - SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2 - - psraw m%9, m%1, 1 - paddw m%9, m%1 - paddw m%9, m%2 - paddw m%9, m%3 ; %9=a4 - - psraw m%10, m%4, 1 - paddw m%10, m%4 - paddw m%10, m%2 - psubw m%10, m%3 ; %10=a7 - - SUMSUB_BA w, %4, %1 - psubw m%1, m%3 - psubw m%4, m%2 - psraw m%3, 1 - psraw m%2, 1 - psubw m%1, m%3 ; %1=a5 - psubw m%4, m%2 ; %4=a6 - - psraw m%2, m%10, 2 - paddw m%2, m%9 ; %2=b1 - psraw m%9, 2 - psubw m%9, m%10 ; %9=b7 - - SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4 - - psraw m%3, m%7, 1 - paddw m%3, m%8 ; %3=b2 - psraw m%8, 1 - psubw m%8, m%7 ; %8=b6 - - psraw m%7, m%4, 2 - paddw m%7, m%1 ; %7=b3 - psraw m%1, 2 - psubw m%4, m%1 ; %4=b5 +; in: size, m0..m7, temp, temp +; out: m0..m7 +%macro DCT8_1D 11 + SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34 + SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25 + SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16 + SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07 + + SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3 + SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2 + + psra%1 m%10, m%2, 1 + padd%1 m%10, m%2 + padd%1 m%10, m%3 + padd%1 m%10, m%4 ; %10=a4 + + psra%1 m%11, m%5, 1 + padd%1 m%11, m%5 + padd%1 m%11, m%3 + psub%1 m%11, m%4 ; %11=a7 + + SUMSUB_BA %1, %5, %2 + psub%1 m%2, m%4 + psub%1 m%5, m%3 + psra%1 m%4, 1 + psra%1 m%3, 1 + psub%1 m%2, m%4 ; %2=a5 + psub%1 m%5, m%3 ; %5=a6 + + psra%1 m%3, m%11, 2 + padd%1 m%3, m%10 ; %3=b1 + psra%1 m%10, 2 + psub%1 m%10, m%11 ; %10=b7 + + SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4 + + psra%1 m%4, m%8, 1 + padd%1 m%4, m%9 ; %4=b2 + psra%1 m%9, 1 + psub%1 m%9, m%8 ; %9=b6 + + psra%1 m%8, m%5, 2 + padd%1 m%8, m%2 ; %8=b3 + psra%1 m%2, 2 + psub%1 m%5, m%2 ; %5=b5 - SWAP %1, %6, %4, %7, %8, %9 + SWAP %2, %7, %5, %8, %9, %10 %endmacro -%macro IDCT8_1D 10 - SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2 - - psraw m%9, m%2, 1 - paddw m%9, m%2 - paddw m%9, m%4 - paddw m%9, m%6 ; %9=a7 - - psraw m%10, m%3, 1 - psubw m%10, m%7 ; %10=a4 - psraw m%7, 1 - paddw m%7, m%3 ; %7=a6 - - psraw m%3, m%6, 1 - paddw m%3, m%6 - paddw m%3, m%8 - psubw m%3, m%2 ; %3=a5 - - psubw m%2, m%4 - psubw m%6, m%4 - paddw m%2, m%8 - psubw m%6, m%8 - psraw m%4, 1 - psraw m%8, 1 - psubw m%2, m%4 ; %2=a3 - psubw m%6, m%8 ; %6=a1 - - psraw m%4, m%9, 2 - paddw m%4, m%6 ; %4=b1 - psraw m%6, 2 - psubw m%9, m%6 ; %9=b7 - - SUMSUB_BA w, %7, %5, %6 ; %7=b0, %5=b6 - SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4 - - psraw m%8, m%3, 2 - paddw m%8, m%2 ; %8=b3 - psraw m%2, 2 - psubw m%2, m%3 ; %2=b5 - - SUMSUB_BA w, %9, %7, %6 ; %9=c0, %7=c7 - SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6 - SUMSUB_BA w, %8, %1, %6 ; %8=c2, %1=c5 - SUMSUB_BA w, %4, %5, %6 ; %4=c3, %5=c4 - - SWAP %10, %3 - SWAP %1, %9, %6 - SWAP %3, %8, %7 +%macro IDCT8_1D 11 + SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2 + + psra%1 m%10, m%3, 1 + padd%1 m%10, m%3 + padd%1 m%10, m%5 + padd%1 m%10, m%7 ; %9=a7 + + psra%1 m%11, m%4, 1 + psub%1 m%11, m%8 ; %10=a4 + psra%1 m%8, 1 + padd%1 m%8, m%4 ; %7=a6 + + psra%1 m%4, m%7, 1 + padd%1 m%4, m%7 + padd%1 m%4, m%9 + psub%1 m%4, m%3 ; %3=a5 + + psub%1 m%3, m%5 + psub%1 m%7, m%5 + padd%1 m%3, m%9 + psub%1 m%7, m%9 + psra%1 m%5, 1 + psra%1 m%9, 1 + psub%1 m%3, m%5 ; %2=a3 + psub%1 m%7, m%9 ; %6=a1 + + psra%1 m%5, m%10, 2 + padd%1 m%5, 
m%7 ; %4=b1 + psra%1 m%7, 2 + psub%1 m%10, m%7 ; %9=b7 + + SUMSUB_BA %1, %8, %6, %7 ; %7=b0, %5=b6 + SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4 + + psra%1 m%9, m%4, 2 + padd%1 m%9, m%3 ; %8=b3 + psra%1 m%3, 2 + psub%1 m%3, m%4 ; %2=b5 + + SUMSUB_BA %1, %10, %8, %7 ; %9=c0, %7=c7 + SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6 + SUMSUB_BA %1, %9, %2, %7 ; %8=c2, %1=c5 + SUMSUB_BA %1, %5, %6, %7 ; %4=c3, %5=c4 + + SWAP %11, %4 + SWAP %2, %10, %7 + SWAP %4, %9, %8 %endmacro +%ifdef HIGH_BIT_DEPTH + +%macro SUB8x8_DCT8 0 +cglobal sub8x8_dct8, 3,3,14 +%ifdef WIN64 + call .skip_prologue + RET +%endif +global current_function %+ .skip_prologue +.skip_prologue: + LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2 + LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2 + + DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9 + + TRANSPOSE4x4W 0,1,2,3,8 + WIDEN_SXWD 0,8 + WIDEN_SXWD 1,9 + WIDEN_SXWD 2,10 + WIDEN_SXWD 3,11 + DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13 + mova [r0+0x00], m0 + mova [r0+0x20], m8 + mova [r0+0x40], m1 + mova [r0+0x60], m9 + mova [r0+0x80], m2 + mova [r0+0xA0], m10 + mova [r0+0xC0], m3 + mova [r0+0xE0], m11 + + TRANSPOSE4x4W 4,5,6,7,0 + WIDEN_SXWD 4,0 + WIDEN_SXWD 5,1 + WIDEN_SXWD 6,2 + WIDEN_SXWD 7,3 + DCT8_1D d,4,0,5,1,6,2,7,3, 8,9 + mova [r0+0x10], m4 + mova [r0+0x30], m0 + mova [r0+0x50], m5 + mova [r0+0x70], m1 + mova [r0+0x90], m6 + mova [r0+0xB0], m2 + mova [r0+0xD0], m7 + mova [r0+0xF0], m3 + ret +%endmacro ; SUB8x8_DCT8 + +INIT_XMM sse2 +SUB8x8_DCT8 +INIT_XMM sse4 +SUB8x8_DCT8 +INIT_XMM avx +SUB8x8_DCT8 + +%macro ADD8x8_IDCT8 0 +cglobal add8x8_idct8, 2,2,16 + add r1, 128 +%ifdef WIN64 + call .skip_prologue + RET +%endif +global current_function %+ .skip_prologue +.skip_prologue: + mova m0, [r1-128] + mova m1, [r1-96] + mova m2, [r1-64] + mova m3, [r1-32] + mova m4, [r1+ 0] + mova m5, [r1+32] + mova m6, [r1+64] + mova m7, [r1+96] + IDCT8_1D d,0,1,2,3,4,5,6,7,8,9 + TRANSPOSE4x4D 0,1,2,3,8 + TRANSPOSE4x4D 4,5,6,7,8 + paddd m0, [pd_32] + paddd m4, [pd_32] + mova [r1+64], m6 + mova [r1+96], m7 + mova m8, [r1-112] + mova m9, [r1-80] + mova m10, [r1-48] + mova m11, [r1-16] + mova m12, [r1+16] + mova m13, [r1+48] + mova m14, [r1+80] + mova m15, [r1+112] + IDCT8_1D d,8,9,10,11,12,13,14,15,6,7 + TRANSPOSE4x4D 8,9,10,11,6 + TRANSPOSE4x4D 12,13,14,15,6 + IDCT8_1D d,0,1,2,3,8,9,10,11,6,7 + mova [r1-112], m8 + mova [r1-80], m9 + mova m6, [r1+64] + mova m7, [r1+96] + IDCT8_1D d,4,5,6,7,12,13,14,15,8,9 + pxor m8, m8 + mova m9, [pw_pixel_max] + STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB] + STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB] + STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB] + STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB] + mova m0, [r1-112] + mova m1, [r1-80] + STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB] + STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB] + STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB] + STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB] + ret +%endmacro ; ADD8x8_IDCT8 + +INIT_XMM sse2 +ADD8x8_IDCT8 +INIT_XMM avx +ADD8x8_IDCT8 + +%else ; !HIGH_BIT_DEPTH + %macro DCT_SUB8 0 -cglobal sub8x8_dct, 3,3,11 +cglobal sub8x8_dct, 3,3,10 add r2, 4*FDEC_STRIDE %if cpuflag(ssse3) mova m7, [hsub_mul] @@ -174,9 +296,9 @@ SWAP 7, 10 LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE - DCT8_1D 0,1,2,3,4,5,6,7,8,9 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 - DCT8_1D 0,1,2,3,4,5,6,7,8,9 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 movdqa [r0+0x00], m0 movdqa [r0+0x10], m1 movdqa [r0+0x20], m2 @@ -198,6 +320,8 @@ DCT_SUB8 INIT_XMM avx DCT_SUB8 +INIT_XMM xop 
+DCT_SUB8 ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) @@ -221,10 +345,10 @@ movdqa m5, [r1+0x50] movdqa m6, [r1+0x60] movdqa m7, [r1+0x70] - IDCT8_1D 0,1,2,3,4,5,6,7,8,10 + IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 paddw m0, [pw_32] ; rounding for the >>6 at the end - IDCT8_1D 0,1,2,3,4,5,6,7,8,10 + IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] | ||
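Note on the hunk above: DCT8_1D and IDCT8_1D now take an explicit element-size argument (w or d) so the same macros can serve the new HIGH_BIT_DEPTH path, which runs the first 1-D pass at 16-bit precision, sign-extends with WIDEN_SXWD, and runs the second pass at 32-bit. The stage names in the macro comments (s07/d07, a0..a7, b0..b7) follow the scalar 8x8 forward transform; the C below reconstructs one 1-D pass from those comments as an orientation sketch, not the upstream reference code:

    /* One 1-D pass of the 8x8 forward transform, named after the s/d/a/b
     * stages in the DCT8_1D macro comments above.  The asm applies this to
     * rows, transposes, then applies it again to columns. */
    static void dct8_1d( int d[8] )
    {
        int s07 = d[0] + d[7], d07 = d[0] - d[7];
        int s16 = d[1] + d[6], d16 = d[1] - d[6];
        int s25 = d[2] + d[5], d25 = d[2] - d[5];
        int s34 = d[3] + d[4], d34 = d[3] - d[4];

        int a0 = s07 + s34, a2 = s07 - s34;
        int a1 = s16 + s25, a3 = s16 - s25;
        int a4 = d16 + d25 + d07 + (d07 >> 1);
        int a5 = d07 - d34 - d25 - (d25 >> 1);
        int a6 = d07 + d34 - d16 - (d16 >> 1);
        int a7 = d16 - d25 + d34 + (d34 >> 1);

        d[0] = a0 + a1;            /* b0 */
        d[1] = a4 + (a7 >> 2);     /* b1 */
        d[2] = a2 + (a3 >> 1);     /* b2 */
        d[3] = a5 + (a6 >> 2);     /* b3 */
        d[4] = a0 - a1;            /* b4 */
        d[5] = a6 - (a5 >> 2);     /* b5 */
        d[6] = (a2 >> 1) - a3;     /* b6 */
        d[7] = (a4 >> 2) - a7;     /* b7 */
    }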
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-a.asm |
@@ -31,6 +31,7 @@ %include "x86util.asm" SECTION_RODATA +pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1 @@ -52,6 +53,8 @@ cextern pw_1 cextern pd_1 cextern pd_32 +cextern pw_ppppmmmm +cextern pw_pmpmpmpm %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 @@ -352,8 +355,8 @@ ;----------------------------------------------------------------------------- ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -%macro SUB_NxN_DCT 6 -cglobal %1, 3,3,11 +%macro SUB_NxN_DCT 7 +cglobal %1, 3,3,%7 %ifndef HIGH_BIT_DEPTH %if mmsize == 8 pxor m7, m7 @@ -363,9 +366,6 @@ %endif %endif ; !HIGH_BIT_DEPTH .skip_prologue: -%ifdef WIN64 - sub rsp, 8 -%endif call %2.skip_prologue add r0, %3 add r1, %4-%5-%6*FENC_STRIDE @@ -380,7 +380,6 @@ add r2, %4-%5-%6*FDEC_STRIDE %ifdef WIN64 call %2.skip_prologue - add rsp, 8 RET %else jmp %2.skip_prologue @@ -392,18 +391,18 @@ ;----------------------------------------------------------------------------- %macro ADD_NxN_IDCT 6-7 %ifdef HIGH_BIT_DEPTH -cglobal %1, 2,2,6 +cglobal %1, 2,2,%7 +%if %3==256 + add r1, 128 +%endif %else cglobal %1, 2,2,11 pxor m7, m7 %endif -%if mmsize==16 +%if mmsize==16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: -%ifdef WIN64 - sub rsp, 8 -%endif call %2.skip_prologue add r0, %4-%5-%6*FDEC_STRIDE add r1, %3 @@ -415,7 +414,6 @@ add r1, %3 %ifdef WIN64 call %2.skip_prologue - add rsp, 8 RET %else jmp %2.skip_prologue @@ -424,24 +422,34 @@ %ifdef HIGH_BIT_DEPTH INIT_MMX -SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0 -SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8 +SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0 +SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0 INIT_XMM -ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0 -ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8 -ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0 -ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8 +ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6 +ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6 +ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6 +ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6 +cextern add8x8_idct8_sse2.skip_prologue +cextern add8x8_idct8_avx.skip_prologue +ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16 +ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16 +cextern sub8x8_dct8_sse2.skip_prologue +cextern sub8x8_dct8_sse4.skip_prologue +cextern sub8x8_dct8_avx.skip_prologue +SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14 +SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14 +SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14 %else ; !HIGH_BIT_DEPTH %ifndef ARCH_X86_64 INIT_MMX -SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0 +SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0 -SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4 +SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4 cextern sub8x8_dct8_mmx.skip_prologue cextern add8x8_idct8_mmx.skip_prologue -SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0 
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0 %endif @@ -449,9 +457,11 @@ cextern sub8x8_dct_sse2.skip_prologue cextern sub8x8_dct_ssse3.skip_prologue cextern sub8x8_dct_avx.skip_prologue -SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0 -SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0 -SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0 +cextern sub8x8_dct_xop.skip_prologue +SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10 +SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10 +SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10 +SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10 cextern add8x8_idct_sse2.skip_prologue cextern add8x8_idct_avx.skip_prologue @@ -466,9 +476,9 @@ cextern sub8x8_dct8_sse2.skip_prologue cextern sub8x8_dct8_ssse3.skip_prologue cextern sub8x8_dct8_avx.skip_prologue -SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0 -SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0 -SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0 +SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 +SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 +SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 %endif ; HIGH_BIT_DEPTH %ifdef HIGH_BIT_DEPTH @@ -727,11 +737,11 @@ ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -%macro DCTDC_2ROW_MMX 3 +%macro DCTDC_2ROW_MMX 4 movq %1, [r1+FENC_STRIDE*(0+%3)] movq m1, [r1+FENC_STRIDE*(1+%3)] - movq m2, [r2+FDEC_STRIDE*(0+%3)] - movq m3, [r2+FDEC_STRIDE*(1+%3)] + movq m2, [r2+FDEC_STRIDE*(0+%4)] + movq m3, [r2+FDEC_STRIDE*(1+%4)] movq %2, %1 punpckldq %1, m1 punpckhdq %2, m1 @@ -747,30 +757,29 @@ psubw %2, m1 %endmacro -%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2 - pshufw mm1, %1, q2200 ; s1 s1 s0 s0 - pshufw mm0, %2, q2301 ; s3 __ s2 __ - paddw mm1, %2 ; s1 s13 s0 s02 - psubw mm1, mm0 ; d13 s13 d02 s02 - pshufw mm0, mm1, q1010 ; d02 s02 d02 s02 - psrlq mm1, 32 ; __ __ d13 s13 - paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13 - psllq mm1, 32 ; d13 s13 - psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13 +%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1) + PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0 + PSHUFLW m0, %2, q2301 ; s3 __ s2 __ + paddw m1, %2 ; s1 s13 s0 s02 + psubw m1, m0 ; d13 s13 d02 s02 + PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02 + psrlq m1, 32 ; __ __ d13 s13 + paddw m0, m1 ; d02 s02 d02+d13 s02+s13 + psllq m1, 32 ; d13 s13 + psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13 %endmacro %ifndef HIGH_BIT_DEPTH INIT_MMX cglobal sub8x8_dct_dc_mmx2, 3,3 - DCTDC_2ROW_MMX m0, m4, 0 - DCTDC_2ROW_MMX m5, m6, 2 + DCTDC_2ROW_MMX m0, m4, 0, 0 + DCTDC_2ROW_MMX m5, m6, 2, 2 paddw m0, m5 paddw m4, m6 punpckldq m0, m4 - add r1, FENC_STRIDE*4 add r2, FDEC_STRIDE*4 - DCTDC_2ROW_MMX m7, m4, 0 - DCTDC_2ROW_MMX m5, m6, 2 + DCTDC_2ROW_MMX m7, m4, 4, 0 + DCTDC_2ROW_MMX m5, m6, 6, 2 paddw m7, m5 paddw m4, m6 punpckldq m7, m4 @@ -779,43 +788,151 @@ ret INIT_XMM -%macro DCTDC_2ROW_SSE2 3 - movq m0, [r1+FENC_STRIDE*(0+%1)] - movq m1, [r1+FENC_STRIDE*(1+%1)] - movq m2, [r2+FDEC_STRIDE*(0+%1)] - movq m3, [r2+FDEC_STRIDE*(1+%1)] - punpckldq m0, m1 - punpckldq m2, m3 - psadbw m0, m7 - psadbw m2, m7 -%if %2 - paddw %3, m0 - paddw m6, m2 +%macro DCTDC_2ROW_SSE2 4 + movq m1, [r1+FENC_STRIDE*(0+%1)] + movq m2, [r1+FENC_STRIDE*(1+%1)] + punpckldq m1, m2 
+ movq m2, [r2+FDEC_STRIDE*(0+%2)] + punpckldq m2, [r2+FDEC_STRIDE*(1+%2)] + psadbw m1, m0 + psadbw m2, m0 +%if %3 + paddd %4, m1 + psubd %4, m2 %else - SWAP %3, m0 - SWAP m6, m2 + psubd m1, m2 + SWAP %4, m1 %endif %endmacro -cglobal sub8x8_dct_dc_sse2, 3,3,8 - pxor m7, m7 - DCTDC_2ROW_SSE2 0, 0, m4 - DCTDC_2ROW_SSE2 2, 1, m4 - add r1, FENC_STRIDE*4 +cglobal sub8x8_dct_dc_sse2, 3,3 + pxor m0, m0 + DCTDC_2ROW_SSE2 0, 0, 0, m3 + DCTDC_2ROW_SSE2 2, 2, 1, m3 add r2, FDEC_STRIDE*4 - psubd m4, m6 - DCTDC_2ROW_SSE2 0, 0, m5 - DCTDC_2ROW_SSE2 2, 1, m5 - psubd m5, m6 - packssdw m4, m5 - movhlps m5, m4 - movdq2q mm0, m4 - movdq2q mm7, m5 - DCT2x2 mm0, mm7 - movq [r0], mm0 + DCTDC_2ROW_SSE2 4, 0, 0, m4 + DCTDC_2ROW_SSE2 6, 2, 1, m4 + packssdw m3, m3 + packssdw m4, m4 + DCT2x2 m3, m4 + movq [r0], m0 + RET + +%macro SUB8x16_DCT_DC 0 +cglobal sub8x16_dct_dc, 3,3 + pxor m0, m0 + DCTDC_2ROW_SSE2 0, 0, 0, m3 + DCTDC_2ROW_SSE2 2, 2, 1, m3 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + DCTDC_2ROW_SSE2 -4, -4, 0, m4 + DCTDC_2ROW_SSE2 -2, -2, 1, m4 + shufps m3, m4, q2020 + DCTDC_2ROW_SSE2 0, 0, 0, m5 + DCTDC_2ROW_SSE2 2, 2, 1, m5 + add r2, FDEC_STRIDE*4 + DCTDC_2ROW_SSE2 4, 0, 0, m4 + DCTDC_2ROW_SSE2 6, 2, 1, m4 + shufps m5, m4, q2020 +%if cpuflag(ssse3) + %define %%sign psignw +%else + %define %%sign pmullw +%endif + SUMSUB_BA d, 5, 3, 0 + packssdw m5, m3 + pshuflw m0, m5, q2301 + pshufhw m0, m0, q2301 + %%sign m5, [pw_pmpmpmpm] + paddw m0, m5 + pshufd m1, m0, q1320 + pshufd m0, m0, q0231 + %%sign m1, [pw_ppppmmmm] + paddw m0, m1 + mova [r0], m0 RET +%endmacro ; SUB8x16_DCT_DC + +INIT_XMM sse2 +SUB8x16_DCT_DC +INIT_XMM ssse3 +SUB8x16_DCT_DC + %endif ; !HIGH_BIT_DEPTH +%macro DCTDC_4ROW_SSE2 2 + mova %1, [r1+FENC_STRIDEB*%2] + mova m0, [r2+FDEC_STRIDEB*%2] +%assign Y (%2+1) +%rep 3 + paddw %1, [r1+FENC_STRIDEB*Y] + paddw m0, [r2+FDEC_STRIDEB*Y] +%assign Y (Y+1) +%endrep + psubw %1, m0 + pshufd m0, %1, q2301 + paddw %1, m0 +%endmacro + +%ifdef HIGH_BIT_DEPTH +%macro SUB8x8_DCT_DC_10 0 +cglobal sub8x8_dct_dc, 3,3,3 + DCTDC_4ROW_SSE2 m1, 0 + DCTDC_4ROW_SSE2 m2, 4 + mova m0, [pw_ppmmmmpp] + pmaddwd m1, m0 + pmaddwd m2, m0 + pshufd m0, m1, q2200 ; -1 -1 +0 +0 + pshufd m1, m1, q0033 ; +0 +0 +1 +1 + paddd m1, m0 + pshufd m0, m2, q1023 ; -2 +2 -3 +3 + paddd m1, m2 + paddd m1, m0 + mova [r0], m1 + RET +%endmacro +INIT_XMM sse2 +SUB8x8_DCT_DC_10 + +%macro SUB8x16_DCT_DC_10 0 +cglobal sub8x16_dct_dc, 3,3,6 + DCTDC_4ROW_SSE2 m1, 0 + DCTDC_4ROW_SSE2 m2, 4 + DCTDC_4ROW_SSE2 m3, 8 + DCTDC_4ROW_SSE2 m4, 12 + mova m0, [pw_ppmmmmpp] + pmaddwd m1, m0 + pmaddwd m2, m0 + pshufd m5, m1, q2200 ; -1 -1 +0 +0 + pshufd m1, m1, q0033 ; +0 +0 +1 +1 + paddd m1, m5 + pshufd m5, m2, q1023 ; -2 +2 -3 +3 + paddd m1, m2 + paddd m1, m5 ; a6 a2 a4 a0 + pmaddwd m3, m0 + pmaddwd m4, m0 + pshufd m5, m3, q2200 + pshufd m3, m3, q0033 + paddd m3, m5 + pshufd m5, m4, q1023 + paddd m3, m4 + paddd m3, m5 ; a7 a3 a5 a1 + paddd m0, m1, m3 + psubd m1, m3 + pshufd m0, m0, q3120 + pshufd m1, m1, q3120 + punpcklqdq m2, m0, m1 + punpckhqdq m1, m0 + mova [r0+ 0], m2 + mova [r0+16], m1 + RET +%endmacro +INIT_XMM sse2 +SUB8x16_DCT_DC_10 +INIT_XMM avx +SUB8x16_DCT_DC_10 +%endif + ;----------------------------------------------------------------------------- ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- @@ -1327,15 +1444,9 @@ mova [r0+(%1+64)*SIZEOF_PIXEL], m2 mova [r0+(%1+96)*SIZEOF_PIXEL], m3 packsswb m0, m1 -%if %1 - por m6, m2 - por m7, m3 - por m5, m0 -%else 
- SWAP 5, 0 - SWAP 6, 2 - SWAP 7, 3 -%endif + ACCUM por, 6, 2, %1 + ACCUM por, 7, 3, %1 + ACCUM por, 5, 0, %1 %endmacro %macro ZIGZAG_8x8_CAVLC 1 | ||
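Note on the sub8x8_dct_dc rewrite and the new sub8x16_dct_dc above: both only need the DC terms, so they reduce each 4x4 sub-block of the residual to a single sum (psadbw against zero in the 8-bit path; paddw plus pmaddwd against pw_ppmmmmpp in the new 10-bit DCTDC_4ROW_SSE2 path) and then apply a small Hadamard across those sums: 2x2 for the 8x8 block and effectively 2x4 for the 8x16 (4:2:2) block. A hedged scalar sketch of the 8x8 case, with the output ordering read off the DCT2x2 macro comments (FENC_STRIDE/FDEC_STRIDE are x264's fixed 8-bit strides, 16 and 32):

    #include <stdint.h>
    enum { FENC_STRIDE = 16, FDEC_STRIDE = 32 };

    /* Sum (pix1 - pix2) over each 4x4 quadrant, then a 2x2 Hadamard. */
    static void sub8x8_dct_dc_ref( int16_t dct[4], const uint8_t *pix1, const uint8_t *pix2 )
    {
        int s[4] = { 0, 0, 0, 0 };              /* quadrants laid out 0 1 / 2 3 */
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
                s[(y>>2)*2 + (x>>2)] += pix1[x + y*FENC_STRIDE] - pix2[x + y*FDEC_STRIDE];

        int s02 = s[0] + s[2], d02 = s[0] - s[2];   /* left-column  sum / difference */
        int s13 = s[1] + s[3], d13 = s[1] - s[3];   /* right-column sum / difference */
        dct[0] = s02 + s13;
        dct[1] = d02 + d13;
        dct[2] = s02 - s13;
        dct[3] = d02 - d13;
    }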
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/dct.h |
@@ -38,8 +38,13 @@ void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] ); void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] ); @@ -69,20 +74,22 @@ void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct8_avx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub16x16_dct8_avx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 ); +void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); +void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] ); -void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] ); -void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] ); -void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] ); -void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] ); +void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] ); +void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] ); +void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] ); +void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] ); void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] ); | ||
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/deblock-a.asm |
@@ -1138,28 +1138,28 @@ ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname -cglobal deblock_h_luma, 5,7 - movsxd r10, r1d - lea r11, [r10+r10*2] - lea r6, [r0-4] - lea r5, [r0-4+r11] +cglobal deblock_h_luma, 5,9 + movsxd r7, r1d + lea r8, [r7*3] + lea r6, [r0-4] + lea r5, [r0-4+r8] %ifdef WIN64 - sub rsp, 0x98 + sub rsp, 0x98 %define pix_tmp rsp+0x30 %else - sub rsp, 0x68 + sub rsp, 0x68 %define pix_tmp rsp %endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 @@ -1174,17 +1174,17 @@ movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) %ifdef WIN64 add rsp, 0x98 @@ -1516,33 +1516,33 @@ ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7 - movsxd r10, r1d - lea r11, [r10*3] - lea r6, [r0-4] - lea r5, [r0-4+r11] - sub rsp, 0x88 +cglobal deblock_h_luma_intra, 4,9 + movsxd r7, r1d + lea r8, [r7*3] + lea r6, [r0-4] + lea r5, [r0-4+r8] + sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] mov r1, 0x10 call deblock_v_luma_intra ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r5, [r6+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - add rsp, 0x88 + lea r5, [r6+r8] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) + add rsp, 0x88 RET %else cglobal deblock_h_luma_intra, 2,4 @@ -1675,7 +1675,6 @@ %macro DEBLOCK_CHROMA 0 cglobal deblock_inter_body - RESET_MM_PERMUTATION LOAD_AB m4, m5, r2, r3 LOAD_MASK m0, 
m1, m2, m3, m4, m5, m7, m6, m4 pxor m4, m4 @@ -1726,7 +1725,6 @@ cglobal deblock_intra_body - RESET_MM_PERMUTATION LOAD_AB m4, m5, r2, r3 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 @@ -1770,7 +1768,107 @@ dec r4 jg .loop REP_RET -%endmacro + +;----------------------------------------------------------------------------- +; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_chroma_intra_mbaff, 4,6,8 + add r1, r1 +%if mmsize == 8 + mov r4, 16/mmsize +.loop: +%else + lea r5, [r1*3] +%endif + CHROMA_H_LOAD r5 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 + CHROMA_H_STORE r5 +%if mmsize == 8 + lea r0, [r0+r1*(mmsize/4)] + dec r4 + jg .loop +%endif + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_h_chroma_mbaff, 5,7,8 + add r1, r1 + lea r6, [r1*3] +%if mmsize == 8 + mov r5, 16/mmsize +.loop: +%endif + CHROMA_H_LOAD r6 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + movd m6, [r4] + punpcklbw m6, m6 + psraw m6, 8 + punpcklwd m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_H_STORE r6 +%if mmsize == 8 + lea r0, [r0+r1*(mmsize/4)] + add r4, mmsize/4 + dec r5 + jg .loop +%endif + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_chroma_422_intra, 4,6,8 + add r1, r1 + mov r4, 64/mmsize +%if mmsize == 16 + lea r5, [r1*3] +%endif +.loop: + CHROMA_H_LOAD r5 + call deblock_intra_body + CHROMA_H_STORE r5 + lea r0, [r0+r1*(mmsize/4)] + dec r4 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_h_chroma_422, 5,7,8 + add r1, r1 + mov r5, 64/mmsize + lea r6, [r1*3] +.loop: + CHROMA_H_LOAD r6 + LOAD_AB m4, m5, r2m, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + movd m6, [r4-1] + psraw m6, 8 + SPLATW m6, m6 + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_H_STORE r6 + lea r0, [r0+r1*(mmsize/4)] +%if mmsize == 16 + inc r4 +%else + mov r2, r5 + and r2, 1 + add r4, r2 ; increment once every 2 iterations +%endif + dec r5 + jg .loop + REP_RET +%endmacro ; DEBLOCK_CHROMA %ifndef ARCH_X86_64 INIT_MMX mmx2 @@ -1791,7 +1889,7 @@ sub t5, r1 %if mmsize==8 mov dword r0m, 2 -.skip_prologue: +.loop: %endif %endmacro @@ -1802,10 +1900,6 @@ lea t6, [r1*3] mov t5, r0 add r0, t6 -%if mmsize==8 - mov dword r0m, 2 -.skip_prologue: -%endif %endmacro %macro CHROMA_V_LOOP 1 @@ -1816,7 +1910,7 @@ add r4, 2 %endif dec dword r0m - jg .skip_prologue + jg .loop %endif %endmacro @@ -1828,7 +1922,7 @@ add r4, 2 %endif dec dword r0m - jg .skip_prologue + jg .loop %endif %endmacro @@ -1865,6 +1959,10 @@ ;----------------------------------------------------------------------------- cglobal 
deblock_h_chroma, 5,7,8 CHROMA_H_START +%if mmsize==8 + mov dword r0m, 2 +.loop: +%endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_inter_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) @@ -1881,21 +1979,44 @@ DEBLOCK_CHROMA %endif -%macro DEBLOCK_H_CHROMA_422 0 -cglobal deblock_h_chroma_422, 5,7,8 -%ifdef ARCH_X86_64 - %define cntr r11 -%else - %define cntr dword r0m -%endif +;----------------------------------------------------------------------------- +; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +%macro DEBLOCK_H_CHROMA_420_MBAFF 0 +cglobal deblock_h_chroma_mbaff, 5,7,8 dec r2d dec r3d sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + RET +%endmacro + +INIT_XMM sse2 +DEBLOCK_H_CHROMA_420_MBAFF +%ifndef ARCH_X86_64 +INIT_MMX mmx2 +DEBLOCK_H_CHROMA_420_MBAFF +%endif + +%macro DEBLOCK_H_CHROMA_422 0 +cglobal deblock_h_chroma_422, 5,8,8 +%ifdef ARCH_X86_64 + %define cntr r7 +%else + %define cntr dword r0m +%endif + CHROMA_H_START mov cntr, 32/mmsize -.skip_prologue: +.loop: TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 @@ -1913,7 +2034,7 @@ lea t5, [t5+r1*(mmsize/2)] add r4, mmsize/8 dec cntr - jg .skip_prologue + jg .loop REP_RET %endmacro @@ -1937,7 +2058,7 @@ %define t5 r4 %define t6 r5 -%macro DEBLOCK_CHROMA_INTRA 0 +%macro DEBLOCK_CHROMA_INTRA_BODY 0 cglobal chroma_intra_body LOAD_MASK r2d, r3d mova m5, m1 @@ -1951,7 +2072,9 @@ paddb m1, m5 paddb m2, m6 ret +%endmacro +%macro DEBLOCK_CHROMA_INTRA 0 ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- @@ -1972,21 +2095,52 @@ ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 CHROMA_H_START +%if mmsize==8 + mov dword r0m, 2 +.loop: +%endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) CHROMA_H_LOOP 0 RET + +cglobal deblock_h_chroma_422_intra, 4,7,8 + CHROMA_H_START + mov r6d, 32/mmsize +.loop: + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + lea r0, [r0+r1*(mmsize/2)] + lea t5, [t5+r1*(mmsize/2)] + dec r6d + jg .loop + REP_RET %endmacro ; DEBLOCK_CHROMA_INTRA INIT_XMM sse2 +DEBLOCK_CHROMA_INTRA_BODY DEBLOCK_CHROMA_INTRA INIT_XMM avx +DEBLOCK_CHROMA_INTRA_BODY DEBLOCK_CHROMA_INTRA -%ifndef ARCH_X86_64 INIT_MMX mmx2 +DEBLOCK_CHROMA_INTRA_BODY +%ifndef ARCH_X86_64 DEBLOCK_CHROMA_INTRA %endif + +;----------------------------------------------------------------------------- +; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +INIT_MMX mmx2 +cglobal deblock_h_chroma_intra_mbaff, 4,6,8 + CHROMA_H_START + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + RET %endif ; !HIGH_BIT_DEPTH | ||
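Note on the deblock hunk above: the new MBAFF and 4:2:2 chroma entry points (deblock_h_chroma_mbaff, deblock_h_chroma_422, and their intra variants) reuse the existing LOAD_AB / LOAD_MASK / DEBLOCK_P0_Q0 cores; they mostly differ in how rows are gathered (CHROMA_H_LOAD / CHROMA_H_STORE) and how the loop counter and tc0 pointer advance. For orientation, the per-sample operation DEBLOCK_P0_Q0 vectorizes is the standard H.264 normal-strength edge filter; a scalar sketch for 8-bit samples (the clip helper is illustrative, not an x264 symbol):

    #include <stdint.h>
    #include <stdlib.h>

    static inline int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

    /* p1 p0 | q0 q1 straddle the edge; alpha/beta are the activity thresholds and
     * tc the clip range derived from the per-edge tc0 values (edges with tc0 < 0
     * are left untouched). */
    static void deblock_p0q0( uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                              int alpha, int beta, int tc )
    {
        if( abs(*p0 - *q0) < alpha && abs(*p1 - *p0) < beta && abs(*q1 - *q0) < beta )
        {
            int delta = clip3( ((*q0 - *p0)*4 + (*p1 - *q1) + 4) >> 3, -tc, tc );
            *p0 = (uint8_t)clip3( *p0 + delta, 0, 255 );
            *q0 = (uint8_t)clip3( *q0 - delta, 0, 255 );
        }
    }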
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/mc-a.asm |
@@ -58,13 +58,16 @@ ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,10,11 - %macro AVG_START 0-1 0 - PROLOGUE 6,7,%1 %ifdef WIN64 - movsxd r5, r5d -%endif + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 + %macro AVG_START 0-1 0 + PROLOGUE 5,7,%1 + movsxd r5, dword r5m + %endmacro +%elifdef UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 @@ -1157,7 +1160,9 @@ jg avg_w16_align%1_%2_ssse3 ret %if %1==0 - times 13 db 0x90 ; make sure the first ones don't end up short + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop %endif %endmacro @@ -1171,7 +1176,7 @@ and eax, 7 jz x264_pixel_avg2_w16_sse2 %endif - PROLOGUE 6, 7 + PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f @@ -1181,8 +1186,8 @@ shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %ifdef PIC - lea r11, [avg_w16_addr] - add r6, r11 + lea r7, [avg_w16_addr] + add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif @@ -1393,17 +1398,22 @@ ;============================================================================= %ifdef ARCH_X86_64 - DECLARE_REG_TMP 10,11,6 + DECLARE_REG_TMP 6,7,8 %else DECLARE_REG_TMP 0,1,2 %endif -%macro MC_CHROMA_START 0 +%macro MC_CHROMA_START 1 +%ifdef ARCH_X86_64 + PROLOGUE 0,9,%1 +%else + PROLOGUE 0,6,%1 +%endif movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - movifnidn t2d, r6m - mov t0d, t2d + movifnidn t0d, r6m + mov t2d, t0d mov t1d, r5d sar t0d, 3 sar t1d, 3 @@ -1447,8 +1457,8 @@ ; int width, int height ) ;----------------------------------------------------------------------------- %macro MC_CHROMA 0 -cglobal mc_chroma, 0,6 - MC_CHROMA_START +cglobal mc_chroma + MC_CHROMA_START 0 FIX_STRIDES r4 and r5d, 7 %ifdef ARCH_X86_64 @@ -1726,8 +1736,8 @@ movifnidn r5d, r8m cmp dword r7m, 4 jg .mc1d_w8 - mov r10, r2 - mov r11, r4 + mov r7, r2 + mov r8, r4 %if mmsize!=8 shr r5d, 1 %endif @@ -1741,7 +1751,7 @@ %else movu m0, [r3] movu m1, [r3+r6] - add r3, r11 + add r3, r8 movu m2, [r3] movu m3, [r3+r6] %endif @@ -1757,7 +1767,7 @@ movq m0, [r3] movq m1, [r3+r6] %if mmsize!=8 - add r3, r11 + add r3, r8 movhps m0, [r3] movhps m1, [r3+r6] %endif @@ -1778,22 +1788,22 @@ psrlw m2, 3 %ifdef HIGH_BIT_DEPTH %if mmsize == 8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 %endif movq [r0], m0 movq [r1], m2 %if mmsize == 16 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 movhps [r0], m0 movhps [r1], m2 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m2 %if mmsize==8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 movd [r0], m0 psrlq m0, 32 movd [r1], m0 @@ -1801,8 +1811,8 @@ movhlps m1, m0 movd [r0], m0 movd [r1], m1 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 psrldq m0, 4 psrldq m1, 4 movd [r0], m0 @@ -1818,8 +1828,8 @@ .mc1d_w8: sub r2, 4*SIZEOF_PIXEL sub r4, 8*SIZEOF_PIXEL - mov r10, 4*SIZEOF_PIXEL - mov r11, 8*SIZEOF_PIXEL + mov r7, 4*SIZEOF_PIXEL + mov r8, 8*SIZEOF_PIXEL %if mmsize==8 shl r5d, 1 %endif @@ -1827,10 +1837,9 @@ %endif ; ARCH_X86_64 %endmacro ; MC_CHROMA - %macro MC_CHROMA_SSSE3 0 -cglobal mc_chroma, 0,6,9 - MC_CHROMA_START +cglobal mc_chroma + MC_CHROMA_START 9 and r5d, 7 and t2d, 7 mov t0d, r5d | ||
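Note on the mc-a.asm hunk above: the AVG_START/biweight changes appear to be register plumbing for the new x86inc register numbering (the r10/r11-era temporaries become r7/r8, and the Win64 path now loads the sixth argument with movsxd r5, dword r5m), not a change of algorithm; the comment at the top still states the assumption log2_denom = 5, offset = 0, weight1 + weight2 = 64. Spelled out as a scalar sketch, the weighted average those kernels compute per pixel is:

    #include <stdint.h>

    /* Implicit weighted bi-prediction with log2_denom = 5 and zero offset:
     * rounding term 1 << 5 = 32, final shift 6, and w1 + w2 == 64. */
    static inline uint8_t avg_weight_8bit( int p1, int p2, int w1, int w2 )
    {
        int v = ( p1*w1 + p2*w2 + 32 ) >> 6;
        return (uint8_t)( v < 0 ? 0 : v > 255 ? 255 : v );
    }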
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/mc-a2.asm |
@@ -660,7 +660,7 @@ mova %1, m1 mova %2, m4 FILT_PACK m1, m4, 5, m15 - movntps [r11+r4+%5], m1 + movntps [r8+r4+%5], m1 %endmacro %macro FILT_C 4 @@ -728,26 +728,26 @@ ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; uint8_t *src, int stride, int width, int height) ;----------------------------------------------------------------------------- -cglobal hpel_filter, 7,7,16 +cglobal hpel_filter, 7,9,16 %ifdef WIN64 movsxd r4, r4d movsxd r5, r5d %endif - mov r10, r3 + mov r7, r3 sub r5, 16 - mov r11, r1 - and r10, 15 - sub r3, r10 + mov r8, r1 + and r7, 15 + sub r3, r7 add r0, r5 - add r11, r5 - add r10, r5 + add r8, r5 + add r7, r5 add r5, r2 mov r2, r4 - neg r10 + neg r7 lea r1, [r3+r2] sub r3, r2 sub r3, r2 - mov r4, r10 + mov r4, r7 mova m15, [pw_16] %if cpuflag(ssse3) mova m0, [filt_mul51] @@ -774,14 +774,14 @@ cmp r4, 16 jl .lastx ; setup regs for next y - sub r4, r10 + sub r4, r7 sub r4, r2 sub r1, r4 sub r3, r4 add r0, r2 - add r11, r2 + add r8, r2 add r5, r2 - mov r4, r10 + mov r4, r7 sub r6d, 1 jg .loopy sfence @@ -950,7 +950,7 @@ ; uint8_t *srcv, int i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w -cglobal plane_copy_interleave_core, 7,7 +cglobal plane_copy_interleave_core, 7,9 FIX_STRIDES r1d, r3d, r5d, r6d %ifdef HIGH_BIT_DEPTH mov r1m, r1d @@ -965,7 +965,7 @@ add r2, r6 add r4, r6 %ifdef ARCH_X86_64 - DECLARE_REG_TMP 10,11 + DECLARE_REG_TMP 7,8 %else DECLARE_REG_TMP 1,3 %endif | ||
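Note on the mc-a2.asm hunk above: hpel_filter and plane_copy_interleave_core likewise just trade r10/r11 for r7/r8 (hence the extra registers requested by cglobal hpel_filter, 7,9,16). For context, the filter being evaluated is H.264's 6-tap (1,-5,20,20,-5,1) half-pel interpolator, which is what the filt_mul51/filt_mul20 constants and the FILT_PACK ..., 5, m15 rounding (m15 = pw_16) implement; the vertical and centre planes apply the same taps in the other direction and on intermediate results. One horizontal output sample, as a sketch:

    #include <stdint.h>

    /* Half-pel sample between src[x] and src[x+1]: 6 taps, +16 rounding, >>5, clip. */
    static inline uint8_t hpel_h( const uint8_t *src, int x )
    {
        int v = src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1] - 5*src[x+2] + src[x+3];
        v = ( v + 16 ) >> 5;
        return (uint8_t)( v < 0 ? 0 : v > 255 ? 255 : v );
    }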
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/pixel-a.asm |
@@ -130,7 +130,7 @@ cextern pw_1 cextern pw_8 cextern pw_16 -cextern pw_64 +cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm @@ -1267,15 +1267,21 @@ %macro BACKUP_POINTERS 0 %ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 +%ifdef WIN64 + PUSH r7 +%endif + mov r6, r0 + mov r7, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 %ifdef ARCH_X86_64 - lea r0, [r10+8] - lea r2, [r11+8] + lea r0, [r6+8] + lea r2, [r7+8] +%ifdef WIN64 + POP r7 +%endif %else mov r0, r0mp mov r2, r2mp @@ -1473,10 +1479,10 @@ ; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal - lea r10, [r0+4*r1] - lea r11, [r2+4*r3] + lea r6, [r0+4*r1] + lea r7, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 - LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 %if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 @@ -1488,7 +1494,7 @@ SAVE_MM_PERMUTATION ret -cglobal pixel_sa8d_8x8, 4,6,12 +cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1506,7 +1512,7 @@ shr eax, 1 RET -cglobal pixel_sa8d_16x16, 4,6,12 +cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1794,6 +1800,12 @@ INIT_MMX cglobal hadamard_load ; not really a global, but otherwise cycles get attributed to the wrong function in profiling +%ifdef HIGH_BIT_DEPTH + mova m0, [r0+0*FENC_STRIDEB] + mova m1, [r0+1*FENC_STRIDEB] + mova m2, [r0+2*FENC_STRIDEB] + mova m3, [r0+3*FENC_STRIDEB] +%else pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] @@ -1803,24 +1815,31 @@ punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 +%endif HADAMARD4_2D 0, 1, 2, 3, 4 SAVE_MM_PERMUTATION ret %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp %ifidn %1, top - movd %3, [r1+%2-FDEC_STRIDE] +%ifdef HIGH_BIT_DEPTH + mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] +%else + movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] pxor %5, %5 punpcklbw %3, %5 +%endif %else ; left %ifnidn %2, 0 - shl %2d, 5 ; log(FDEC_STRIDE) + shl %2d, 5 ; log(FDEC_STRIDEB) %endif - movd %3, [r1+%2-4+1*FDEC_STRIDE] - pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0 - pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2 - pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3 + movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB] + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3 +%ifndef HIGH_BIT_DEPTH psrlw %3, 8 +%endif %ifnidn %2, 0 shr %2d, 5 %endif @@ -1859,19 +1878,6 @@ %8 %3, %6 %endmacro -%macro CLEAR_SUMS 0 -%ifdef ARCH_X86_64 - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 -%else - pxor m7, m7 - movq [sums+0], m7 - movq [sums+8], m7 - movq [sums+16], m7 -%endif -%endmacro - ; in: m1..m3 ; out: m7 ; clobber: m4..m6 @@ -1942,45 +1948,47 @@ %endif RET -%ifdef ARCH_X86_64 - %define t0 r10 - %define t2 r11 -%else - %define t0 r0 - %define t2 r2 -%endif - ;----------------------------------------------------------------------------- ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_16x16, 0,5 - %assign stack_pad 88 + ((stack_offset+88+gprsize)&15) + %assign stack_pad 120 + ((stack_offset+120+gprsize)&15) ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad 
-%define sums rsp+64 ; size 24 +%define sums rsp+64 ; size 56 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 movifnidn r1, r1mp - CLEAR_SUMS + + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%ifdef HIGH_BIT_DEPTH + mova [sums+24], m7 + mova [sums+32], m7 + mova [sums+40], m7 + mova [sums+48], m7 +%endif ; 1D hadamards - mov t0d, 12 - movd m6, [pw_64] + mov r3d, 12 + movd m6, [pw_32] .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m1, m2, m3 - paddw m6, m0 - paddw m6, m1 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m1, m2, m3 + pavgw m0, m1 + paddw m6, m0 + sub r3d, 4 jge .loop_edge - psrlw m6, 3 - pand m6, [sw_f0] ; dc + psrlw m6, 2 + pand m6, [sw_f0] ; dc ; 2D hadamards - movifnidn r0, r0mp - mov r3, -4 + movifnidn r0, r0mp + mov r3, -4 .loop_y: - mov r4, -4 + mov r4, -4 .loop_x: call hadamard_load @@ -1988,38 +1996,74 @@ SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)] pavgw m4, m7 pavgw m5, m7 - paddw m0, [sums+0] ; i16x16_v satd - paddw m4, [sums+8] ; i16x16_h satd + paddw m0, [sums+ 0] ; i16x16_v satd + paddw m4, [sums+ 8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd - movq [sums+0], m0 - movq [sums+8], m4 - movq [sums+16], m5 + mova [sums+ 0], m0 + mova [sums+ 8], m4 + mova [sums+16], m5 - add r0, 4 + add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-16 +%ifdef HIGH_BIT_DEPTH + mova m7, [pw_1] + pmaddwd m4, m7 + pmaddwd m0, m7 + paddd m4, [sums+32] + paddd m0, [sums+24] + mova [sums+32], m4 + mova [sums+24], m0 + pxor m7, m7 + punpckhwd m3, m5, m7 + punpcklwd m5, m7 + paddd m3, [sums+48] + paddd m5, [sums+40] + mova [sums+48], m3 + mova [sums+40], m5 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%endif + add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL inc r3 jl .loop_y ; horizontal sum movifnidn r2, r2mp - movq m2, [sums+16] - movq m1, [sums+8] - movq m0, [sums+0] - movq m7, m2 - SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd +%ifdef HIGH_BIT_DEPTH + mova m1, m5 + paddd m5, m3 + HADDD m5, m7 ; DC satd + HADDD m4, m7 ; H satd + HADDD m0, m7 ; the part of V satd that doesn't overlap with DC + psrld m0, 1 + psrlq m1, 32 ; DC[1] + paddd m0, m3 ; DC[2] + psrlq m3, 32 ; DC[3] + paddd m0, m1 + paddd m0, m3 +%else + mova m7, m5 + SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 - paddd m0, m2 + paddd m0, m5 psubd m0, m7 - movd [r2+8], m2 ; i16x16_dc satd - movd [r2+4], m1 ; i16x16_h satd - movd [r2+0], m0 ; i16x16_v satd - ADD rsp, stack_pad +%endif + movd [r2+8], m5 ; i16x16_dc satd + movd [r2+4], m4 ; i16x16_h satd + movd [r2+0], m0 ; i16x16_v satd + ADD rsp, stack_pad RET +%ifdef ARCH_X86_64 + %define t0 r6 +%else + %define t0 r2 +%endif + ;----------------------------------------------------------------------------- ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- @@ -2031,32 +2075,35 @@ %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 movifnidn r1, r1mp - CLEAR_SUMS + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 ; 1D hadamards - mov t0d, 4 + mov r3d, 4 .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m0, m1, m2 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m0, m1, m2 + sub r3d, 4 jge .loop_edge ; dc - movzx t2d, word [left_1d+0] + movzx t0d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word 
[top_1d+8] - lea t2d, [t2 + r3 + 16] + lea t0d, [t0 + r3 + 16] lea r3d, [r4 + r5 + 16] - shr t2d, 1 + shr t0d, 1 shr r3d, 1 add r4d, 8 add r5d, 8 - and t2d, -16 ; tl + and t0d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr - mov [dc_1d+ 0], t2d ; tl + mov [dc_1d+ 0], t0d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br @@ -2082,10 +2129,10 @@ movq [sums+8], m4 movq [sums+0], m5 - add r0, 4 + add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-8 + add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL add r5, 8 inc r3 jl .loop_y @@ -2095,10 +2142,18 @@ movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 +%ifdef HIGH_BIT_DEPTH + psrlq m7, 16 + HADDW m7, m3 + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd + psrld m2, 1 + paddd m2, m7 +%else psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 +%endif movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd @@ -3717,9 +3772,9 @@ SATDS_SSE2 %ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 +%endif INIT_MMX mmx2 INTRA_X3_MMX -%endif INIT_XMM sse2 HADAMARD_AC_SSE2 @@ -3808,13 +3863,8 @@ pmaddwd m7, m5, m6 pmaddwd m5, m5 pmaddwd m6, m6 -%if %1==0 - SWAP 3, 5 - SWAP 4, 7 -%else - paddd m3, m5 - paddd m4, m7 -%endif + ACCUM paddd, 3, 5, %1 + ACCUM paddd, 4, 7, %1 paddd m3, m6 %endmacro | ||
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/pixel.h |
@@ -98,6 +98,9 @@ void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_4x4_sse2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_4x4_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_4x4_avx ( pixel *, pixel *, int * ); void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); @@ -112,6 +115,7 @@ void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); @@ -153,6 +157,8 @@ int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * ); int x264_pixel_vsad_mmx2( pixel *src, int stride, int height ); int x264_pixel_vsad_sse2( pixel *src, int stride, int height ); +int x264_pixel_vsad_ssse3( pixel *src, int stride, int height ); +int x264_pixel_vsad_xop( pixel *src, int stride, int height ); #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ | ||
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/predict-a.asm |
@@ -34,6 +34,7 @@ pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 8 dw -3 +pw_m7: times 8 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -1079,36 +1080,42 @@ ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- %ifndef ARCH_X86_64 -INIT_MMX -cglobal predict_8x8c_p_core_mmx2, 1,2 +%ifndef HIGH_BIT_DEPTH +%macro PREDICT_CHROMA_P_MMX 1 +cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS - movq mm1, mm2 - pmullw mm2, [pw_3210] - psllw mm1, 2 - paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} - paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - - mov r1d, 8 + movq m1, m2 + pmullw m2, [pw_3210] + psllw m1, 2 + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} + paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} + mov r1d, %1 ALIGN 4 .loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 + movq m5, m0 + movq m6, m1 + psraw m5, 5 + psraw m6, 5 + packuswb m5, m6 + movq [r0], m5 - paddsw mm0, mm4 - paddsw mm1, mm4 + paddsw m0, m4 + paddsw m1, m4 add r0, FDEC_STRIDE dec r1d - jg .loop + jg .loop REP_RET +%endmacro ; PREDICT_CHROMA_P_MMX + +INIT_MMX mmx2 +PREDICT_CHROMA_P_MMX 8 +PREDICT_CHROMA_P_MMX 16 +%endif ; !HIGH_BIT_DEPTH %endif ; !ARCH_X86_64 -%macro PREDICT_8x8C 0 +%macro PREDICT_CHROMA_P_XMM 1 %ifdef HIGH_BIT_DEPTH -cglobal predict_8x8c_p_core, 1,1,7 +cglobal predict_8x%1c_p_core, 1,2,7 movd m0, r1m movd m2, r2m movd m4, r3m @@ -1118,9 +1125,13 @@ SPLATW m2, m2, 0 SPLATW m4, m4, 0 pmullw m2, [pw_43210123] ; b - pmullw m5, m4, [pw_m3] ; c +%if %1 == 16 + pmullw m5, m4, [pw_m7] ; c +%else + pmullw m5, m4, [pw_m3] +%endif paddw m5, [pw_16] - mov r1d, 8 + mov r1d, %1 .loop: paddsw m6, m2, m5 paddsw m6, m0 @@ -1129,11 +1140,11 @@ mova [r0], m6 paddw m5, m4 add r0, FDEC_STRIDEB - dec r1d + dec r1d jg .loop REP_RET %else ; !HIGH_BIT_DEPTH -cglobal predict_8x8c_p_core, 1,1 +cglobal predict_8x%1c_p_core, 1,2 movd m0, r1m movd m2, r2m movd m4, r3m @@ -1144,8 +1155,7 @@ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} paddsw m3, m0, m4 paddsw m4, m4 -call .loop - add r0, FDEC_STRIDE*4 + mov r1d, %1/4 .loop: paddsw m1, m3, m4 paddsw m5, m0, m4 @@ -1161,14 +1171,19 @@ packuswb m5, m1 movq [r0+FDEC_STRIDE*2], m5 movhps [r0+FDEC_STRIDE*3], m5 + add r0, FDEC_STRIDE*4 + dec r1d + jg .loop RET %endif ; HIGH_BIT_DEPTH -%endmacro +%endmacro ; PREDICT_CHROMA_P_XMM INIT_XMM sse2 -PREDICT_8x8C +PREDICT_CHROMA_P_XMM 8 +PREDICT_CHROMA_P_XMM 16 INIT_XMM avx -PREDICT_8x8C +PREDICT_CHROMA_P_XMM 8 +PREDICT_CHROMA_P_XMM 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) @@ -1407,6 +1422,51 @@ %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- +; void predict_8x8_vl( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_VL_10 1 +cglobal predict_8x8_vl, 2,2,8 + mova m0, [r1+16*SIZEOF_PIXEL] + mova m1, [r1+24*SIZEOF_PIXEL] + PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 + PSRLPIX m4, m1, 1 + pavg%1 m6, m0, m2 + pavg%1 m7, m1, m4 + add r0, FDEC_STRIDEB*4 + mova [r0-4*FDEC_STRIDEB], m6 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5 + mova [r0-2*FDEC_STRIDEB], m3 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5 + mova [r0+0*FDEC_STRIDEB], m3 + PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5 + mova 
[r0+2*FDEC_STRIDEB], m7 + PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6 + PSLLPIX m5, m0, 1 + PRED8x8_LOWPASS m0, m5, m2, m0, m7 + PRED8x8_LOWPASS m1, m3, m4, m1, m7 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2 + mova [r0-3*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2 + mova [r0-1*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2 + mova [r0+1*FDEC_STRIDEB], m4 + PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2 + mova [r0+3*FDEC_STRIDEB], m1 + RET +%endmacro +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_VL_10 w +INIT_XMM ssse3 +PREDICT_8x8_VL_10 w +INIT_XMM avx +PREDICT_8x8_VL_10 w +%else +INIT_MMX mmx2 +PREDICT_8x8_VL_10 b +%endif + +;----------------------------------------------------------------------------- ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HD 2 @@ -1618,7 +1678,6 @@ ;----------------------------------------------------------------------------- %ifdef HIGH_BIT_DEPTH -INIT_XMM sse2 %macro PREDICT_C_H 1 cglobal predict_8x%1c_h, 1,1 add r0, FDEC_STRIDEB*4 @@ -1627,11 +1686,18 @@ movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2] SPLATW m0, m0, 1 mova [r0+FDEC_STRIDEB*Y], m0 +%if mmsize == 8 + mova [r0+FDEC_STRIDEB*Y+8], m0 +%endif %assign Y Y+1 %endrep RET %endmacro +INIT_MMX mmx2 +PREDICT_C_H 8 +PREDICT_C_H 16 +INIT_XMM sse2 PREDICT_C_H 8 PREDICT_C_H 16 | ||
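Note on the predict-a.asm hunk above: besides generalising the chroma plane and H predictors to 8x16, it adds PREDICT_8x8_VL_10, a vertical-left 8x8 predictor instantiated for sse2/ssse3/avx in the high-bit-depth build and for mmx2 in the 8-bit build (per the %ifdef at the end of the macro), and registered in predict-c.c below. Even output rows are the 2-tap average of neighbouring filtered top-edge samples and odd rows the (1,2,1)/4 lowpass, i.e. the usual Intra_8x8 Vertical_Left rule. A hedged scalar sketch, with t[] standing for the 16 filtered above-samples the asm loads from edge[16..31] (FDEC_STRIDE is x264's fixed 8-bit reconstruction stride, 32):

    #include <stdint.h>
    enum { FDEC_STRIDE = 32 };

    static void predict_8x8_vl_ref( uint8_t *src, const uint8_t t[16] )
    {
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int i = x + (y >> 1);
                src[y*FDEC_STRIDE + x] = (y & 1)
                    ? (uint8_t)(( t[i] + 2*t[i+1] + t[i+2] + 2 ) >> 2)   /* odd rows  */
                    : (uint8_t)(( t[i] + t[i+1] + 1 ) >> 1);             /* even rows */
            }
    }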
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/predict-c.c |
@@ -99,6 +99,41 @@ PREDICT_16x16_P( avx ) #endif //!HIGH_BIT_DEPTH +#define PREDICT_8x16C_P_CORE \ + int H = 0, V = 0;\ + for( int i = 0; i < 4; i++ )\ + H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ + for( int i = 0; i < 8; i++ )\ + V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\ + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 5 * V + 32 ) >> 6; + +#if HIGH_BIT_DEPTH +#define PREDICT_8x16_P(name)\ +static void x264_predict_8x16c_p_##name( uint16_t *src )\ +{\ + PREDICT_8x16C_P_CORE \ + x264_predict_8x16c_p_core_##name( src, a, b, c );\ +} + +PREDICT_8x16_P(sse2) +PREDICT_8x16_P(avx) +#else +#define PREDICT_8x16_P(name)\ +static void x264_predict_8x16c_p_##name( uint8_t *src )\ +{\ + PREDICT_8x16C_P_CORE \ + int i00 = a -3*b -7*c + 16;\ + x264_predict_8x16c_p_core_##name( src, i00, b, c );\ +} +#ifndef ARCH_X86_64 +PREDICT_8x16_P(mmx2) +#endif +PREDICT_8x16_P(sse2) +PREDICT_8x16_P(avx) +#endif + #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH static void x264_predict_16x16_p_sse2( uint16_t *src ) @@ -338,6 +373,7 @@ if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2; @@ -386,12 +422,17 @@ if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; + pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2; + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2; + if( !(cpu&X264_CPU_AVX) ) + return; + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; #else pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) @@ -399,9 +440,18 @@ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; +#ifndef ARCH_X86_64 + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_mmx2; +#endif + if( !(cpu&X264_CPU_SSE2) ) + return; + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3; + if( !(cpu&X264_CPU_AVX) ) + return; + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; #endif // HIGH_BIT_DEPTH } @@ -419,6 +469,7 @@ pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; @@ -429,6 +480,7 @@ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) @@ -440,6 +492,7 @@ return; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx; *predict_8x8_filter = x264_predict_8x8_filter_avx; #else @@ -449,6 +502,7 @@ 
pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2; pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2; *predict_8x8_filter = x264_predict_8x8_filter_mmx2; #if ARCH_X86 pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2; | ||
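Note on the predict-c.c hunk above: PREDICT_8x16C_P_CORE computes the plane parameters for the 8x16 chroma block exactly as shown (H over the 8-wide top edge, V over the 16-tall left edge, a from src[-1+15*FDEC_STRIDE] and src[7-FDEC_STRIDE]); the 8-bit wrappers then fold them into i00 = a - 3*b - 7*c + 16 and hand i00/b/c to the asm cores, which simply walk the plane. What that fill amounts to, as a scalar sketch for 8-bit pixels (FDEC_STRIDE is x264's fixed reconstruction stride, 32):

    #include <stdint.h>
    enum { FDEC_STRIDE = 32 };

    /* pred[y][x] = clip255( (i00 + b*x + c*y) >> 5 ), x in [0,8), y in [0,16). */
    static void predict_8x16c_p_fill( uint8_t *src, int i00, int b, int c )
    {
        for( int y = 0; y < 16; y++, src += FDEC_STRIDE, i00 += c )
            for( int x = 0; x < 8; x++ )
            {
                int v = ( i00 + b*x ) >> 5;
                src[x] = (uint8_t)( v < 0 ? 0 : v > 255 ? 255 : v );
            }
    }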
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/predict.h |
@@ -56,9 +56,12 @@ void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); void x264_predict_8x16c_v_mmx( uint8_t *src ); void x264_predict_8x16c_v_sse2( uint16_t *src ); -void x264_predict_8x16c_h_mmx2( uint8_t *src ); +void x264_predict_8x16c_h_mmx2( pixel *src ); void x264_predict_8x16c_h_sse2( pixel *src ); void x264_predict_8x16c_h_ssse3( uint8_t *src ); +void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); @@ -68,7 +71,7 @@ void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); void x264_predict_8x8c_v_mmx( pixel *src ); void x264_predict_8x8c_v_sse2( uint16_t *src ); -void x264_predict_8x8c_h_mmx2( uint8_t *src ); +void x264_predict_8x8c_h_mmx2( pixel *src ); void x264_predict_8x8c_h_sse2( pixel *src ); void x264_predict_8x8c_h_ssse3( uint8_t *src ); void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); @@ -93,8 +96,10 @@ void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] ); -void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] ); | ||
| Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/quant-a.asm |
@@ -138,11 +138,7 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 -%if %4 - por m5, m1 -%else - SWAP 5, 1 -%endif + ACCUM por, 5, 1, %4 %else ; !sse4 mova m0, [%1] ABSD m1, m0 @@ -156,11 +152,7 @@ psrld m1, 16 PSIGND m1, m0 mova [%1], m1 -%if %4 - por m5, m1 -%else - SWAP 5, 1 -%endif + ACCUM por, 5, 1, %4 %endif ; cpuflag %endmacro @@ -180,11 +172,7 @@ PSIGND m3, m1 mova [%1], m2 mova [%1+mmsize], m3 -%if %4 - por m5, m2 -%else - SWAP 5, 2 -%endif + ACCUM por, 5, 2, %4 por m5, m3 %else ; !sse4 QUANT_ONE_DC %1, %2, %3, %4 @@ -208,11 +196,7 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 -%if %4 - por m5, m1 -%else - SWAP 5, 1 -%endif + ACCUM por, 5, 1, %4 %endmacro %macro QUANT_TWO_AC 4 @@ -231,11 +215,7 @@ PSIGND m3, m1 mova [%1], m2 mova [%1+mmsize], m3 -%if %4 - por m5, m2 -%else - SWAP 5, 2 -%endif + ACCUM por, 5, 2, %4 por m5, m3 %else ; !sse4 QUANT_ONE_AC_MMX %1, %2, %3, %4 @@ -307,11 +287,7 @@ pmulhuw m0, %2 ; divide PSIGNW m0, m1 ; restore sign mova %1, m0 ; store -%if %4 - por m5, m0 -%else - SWAP 5, 0 -%endif + ACCUM por, 5, 0, %4 %endmacro %macro QUANT_TWO 7 @@ -327,13 +303,8 @@ PSIGNW m2, m3 mova %1, m0 mova %2, m2 -%if %7 - por m5, m0 - por m5, m2 -%else - SWAP 5, 0 + ACCUM por, 5, 0, %7 por m5, m2 -%endif %endmacro ;----------------------------------------------------------------------------- @@ -950,10 +921,10 @@ ;This is not true for score64. cglobal decimate_score%1, 1,3 %ifdef PIC - lea r10, [decimate_table4] - lea r11, [decimate_mask_table4] - %define table r10 - %define mask_table r11 + lea r4, [decimate_table4] + lea r5, [decimate_mask_table4] + %define table r4 + %define mask_table r5 %else %define table decimate_table4 %define mask_table decimate_mask_table4 @@ -1019,10 +990,10 @@ %macro DECIMATE8x8 0 %ifdef ARCH_X86_64 -cglobal decimate_score64, 1,4 +cglobal decimate_score64, 1,5 %ifdef PIC - lea r10, [decimate_table8] - %define table r10 + lea r4, [decimate_table8] + %define table r4 %else %define table decimate_table8 %endif @@ -1381,8 +1352,16 @@ movifnidn t1, r1mp pxor m2, m2 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d - not t5d - shl t5d, 32-((%1+1)&~1) +%if %1==15 + shr t5d, 1 +%elif %1==8 + and t5d, 0xff +%elif %1==4 + and t5d, 0xf +%endif + xor t5d, (1<<%1)-1 + mov [t1+4], t5d + shl t5d, 32-%1 mov t4d, %1-1 LZCOUNT t3d, t5d, 0x1f xor t6d, t6d @@ -1394,12 +1373,12 @@ LZCOUNT t3d, t5d, 0x1f %ifdef HIGH_BIT_DEPTH mov t2d, [t0+t4*4] - mov [t1+t6 +4+16*4], t3b - mov [t1+t6*4+ 4], t2d + mov [t1+t6+8+16*4], t3b + mov [t1+t6*4+ 8], t2d %else mov t2w, [t0+t4*2] - mov [t1+t6 +4+16*2], t3b - mov [t1+t6*2+ 4], t2w + mov [t1+t6+8+16*2], t3b + mov [t1+t6*2+ 8], t2w %endif inc t3d shl t5d, t3b | ||
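Note on the quant-a.asm hunk above: most of it replaces the open-coded "%if %4 ... por / SWAP" accumulation pattern with the ACCUM helper, moves decimate_score's PIC temporaries off r10/r11 (now r4/r5), and extends coeff_level_run to also store the nonzero-coefficient mask (the new mov [t1+4], t5d), shifting the run/level storage from offset 4 to offset 8. For readers unfamiliar with decimate_score, the scalar logic it accelerates is roughly the following (a sketch from memory of the C reference in common/quant.c; the exact early-out constant and the table contents, decimate_table4/8, live there):

    #include <stdint.h>

    /* Walk coefficients from the last nonzero one backwards; any |level| > 1
     * makes the block too expensive to drop, otherwise each nonzero level adds
     * a cost indexed by the length of the zero run that precedes it. */
    static int decimate_score_ref( const int16_t *dct, int n, const uint8_t *ds_table )
    {
        int idx = n - 1, score = 0;
        while( idx >= 0 && dct[idx] == 0 )
            idx--;
        while( idx >= 0 )
        {
            if( dct[idx] < -1 || dct[idx] > 1 )
                return 9;                   /* large enough to exceed any threshold */
            idx--;
            int run = 0;
            while( idx >= 0 && dct[idx] == 0 )
            {
                idx--;
                run++;
            }
            score += ds_table[run];
        }
        return score;
    }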
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/quant.h
@@ -110,5 +110,17 @@ int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); +int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); +int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced ); +int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced ); +int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis ); +int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis ); +int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis ); +int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis ); +int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs ); +int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs ); +int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS ); +int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS ); #endif | ||
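The new trellis entry points share their long argument list through the TRELLIS_PARAMS macro. Judging from the function signature documented at the top of the new trellis-64.asm below, it presumably expands to something close to the following reconstruction (dctcoef being x264's bit-depth-dependent coefficient type); treat this as an assumption, not the literal macro:

    /* Presumed expansion of TRELLIS_PARAMS, reconstructed from the argument
     * list documented in trellis-64.asm; the actual macro in quant.h may
     * differ in detail. */
    #define TRELLIS_PARAMS const int *unquant_mf, const uint8_t *zigzag, int lambda2,\
                           int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs,\
                           dctcoef *dct, uint8_t *cabac_state_sig, uint8_t *cabac_state_last,\
                           uint64_t level_state0, uint16_t level_state1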
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/sad-a.asm
@@ -242,11 +242,7 @@ psadbw m1, m3 psadbw m2, m4 lea r2, [r2+2*r3] -%if %1 - paddw m0, m1 -%else - SWAP 0, 1 -%endif + ACCUM paddw, 0, 1, %1 paddw m0, m2 %endmacro @@ -287,10 +283,11 @@ psadbw m3, m5 psadbw m4, m6 psadbw m5, m7 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - paddw m0, m5 + ;max sum: 31*16*255(pixel_max)=126480 + paddd m0, m2 + paddd m0, m3 + paddd m0, m4 + paddd m0, m5 mova m2, m6 mova m3, m7 sub r2d, 2 @@ -321,7 +318,8 @@ jg .loop .end: movhlps m1, m0 - paddw m0, m1 + ;max sum: 31*16*255(pixel_max)=126480 + paddd m0, m1 movd eax, m0 RET @@ -389,25 +387,13 @@ movq m5, [r0+FENC_STRIDE*%1] movq m4, m5 psadbw m4, m0 -%if %1 - paddw m1, m4 -%else - SWAP 1, 4 -%endif + ACCUM paddw, 1, 4, %1 movq m4, m5 psadbw m4, m6 -%if %1 - paddw m2, m4 -%else - SWAP 2, 4 -%endif + ACCUM paddw, 2, 4, %1 pshufw m4, m7, %2 psadbw m5, m4 -%if %1 - paddw m3, m5 -%else - SWAP 3, 5 -%endif + ACCUM paddw, 3, 5, %1 %endmacro INIT_MMX @@ -465,13 +451,8 @@ psadbw m5, m6 paddw m1, m3 paddw m4, m5 -%if %1 - paddw m0, m1 - paddw m2, m4 -%else - SWAP 0,1 - SWAP 2,4 -%endif + ACCUM paddw, 0, 1, %1 + ACCUM paddw, 2, 4, %1 %endmacro %macro INTRA_SAD_8x8C 0 @@ -1436,7 +1417,7 @@ jmp pixel_sad_x3_%1x%2_%4 .split: %ifdef ARCH_X86_64 - PROLOGUE 6,7 + PROLOGUE 6,9 %ifdef WIN64 movsxd r4, r4d sub rsp, 8 @@ -1446,26 +1427,26 @@ mov r2, r1 mov r1, FENC_STRIDE mov r3, r4 - mov r10, r0 - mov r11, r5 + mov r7, r0 + mov r8, r5 call pixel_sad_%1x%2_cache%3_%5 - mov [r11], eax + mov [r8], eax %ifdef WIN64 mov r2, [rsp] %else pop r2 %endif - mov r0, r10 + mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 - mov [r11+4], eax + mov [r8+4], eax %ifdef WIN64 mov r2, [rsp+8] %else pop r2 %endif - mov r0, r10 + mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 - mov [r11+8], eax + mov [r8+8], eax %ifdef WIN64 add rsp, 24 %endif @@ -1502,8 +1483,8 @@ jmp pixel_sad_x4_%1x%2_%4 .split: %ifdef ARCH_X86_64 - PROLOGUE 6,7 - mov r11, r6mp + PROLOGUE 6,9 + mov r8, r6mp %ifdef WIN64 movsxd r5, r5d %endif @@ -1513,33 +1494,33 @@ mov r2, r1 mov r1, FENC_STRIDE mov r3, r5 - mov r10, r0 + mov r7, r0 call pixel_sad_%1x%2_cache%3_%5 - mov [r11], eax + mov [r8], eax %ifdef WIN64 mov r2, [rsp] %else pop r2 %endif - mov r0, r10 + mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 - mov [r11+4], eax + mov [r8+4], eax %ifdef WIN64 mov r2, [rsp+8] %else pop r2 %endif - mov r0, r10 + mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 - mov [r11+8], eax + mov [r8+8], eax %ifdef WIN64 mov r2, [rsp+16] %else pop r2 %endif - mov r0, r10 + mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 - mov [r11+12], eax + mov [r8+12], eax %ifdef WIN64 add rsp, 24 %endif | ||
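The paddw→paddd switch in this hunk is an overflow fix spelled out by the new "max sum" comments: the worst-case accumulated value, 31*16*255 = 126480, no longer fits in a 16-bit lane, so the sums are carried in 32 bits. The r10/r11 → r7/r8 renames and the PROLOGUE 6,7 → 6,9 changes track the register renumbering in the reworked x86inc.asm below. A quick, purely illustrative check of the overflow arithmetic:

    #include <assert.h>
    #include <stdint.h>
    /* Worst case quoted in the comments above: 31 accumulations of 16 pixels
     * with a per-pixel difference of at most 255. */
    int main(void)
    {
        uint32_t max_sum = 31 * 16 * 255;  /* = 126480 */
        assert(max_sum == 126480);
        assert(max_sum > UINT16_MAX);      /* would wrap in 16-bit paddw lanes */
        return 0;
    }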
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/sad16-a.asm
@@ -29,6 +29,8 @@ SECTION .text cextern pw_1 +cextern pw_4 +cextern pw_8 ;============================================================================= ; SAD MMX @@ -347,6 +349,57 @@ %endrep %endmacro +%macro PIXEL_VSAD 0 +cglobal pixel_vsad, 3,3,8 + mova m0, [r0] + mova m1, [r0+16] + mova m2, [r0+2*r1] + mova m3, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m0, m2 + psubw m1, m3 + ABSW2 m0, m1, m0, m1, m4, m5 + paddw m0, m1 + sub r2d, 2 + je .end +.loop: + mova m4, [r0] + mova m5, [r0+16] + mova m6, [r0+2*r1] + mova m7, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m2, m4 + psubw m3, m5 + psubw m4, m6 + psubw m5, m7 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + ABSW m4, m4, m1 + ABSW m5, m5, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + mova m2, m6 + mova m3, m7 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682 +%else + HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426 +%endif + movd eax, m0 + RET +%endmacro +INIT_XMM sse2 +PIXEL_VSAD +INIT_XMM ssse3 +PIXEL_VSAD +INIT_XMM xop +PIXEL_VSAD + ;----------------------------------------------------------------------------- ; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, ; uint16_t *pix2, int i_stride, int scores[3] ) @@ -418,3 +471,129 @@ SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 + +;----------------------------------------------------------------------------- +; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] ); +;----------------------------------------------------------------------------- + +%macro INTRA_SAD_X3_4x4 0 +cglobal intra_sad_x3_4x4, 3,3,7 + movq m0, [r1-1*FDEC_STRIDEB] + movq m1, [r0+0*FENC_STRIDEB] + movq m2, [r0+2*FENC_STRIDEB] + pshuflw m6, m0, q1032 + paddw m6, m0 + pshuflw m5, m6, q2301 + paddw m6, m5 + punpcklqdq m6, m6 ;A+B+C+D 8 times + punpcklqdq m0, m0 + movhps m1, [r0+1*FENC_STRIDEB] + movhps m2, [r0+3*FENC_STRIDEB] + psubw m3, m1, m0 + psubw m0, m2 + ABSW m3, m3, m5 + ABSW m0, m0, m5 + paddw m0, m3 + HADDW m0, m5 + movd [r2], m0 ;V prediction cost + movd m3, [r1+0*FDEC_STRIDEB-4] + movhps m3, [r1+1*FDEC_STRIDEB-8] + movd m4, [r1+2*FDEC_STRIDEB-4] + movhps m4, [r1+3*FDEC_STRIDEB-8] + pshufhw m3, m3, q3333 + pshufhw m4, m4, q3333 + pshuflw m3, m3, q1111 ; FF FF EE EE + pshuflw m4, m4, q1111 ; HH HH GG GG + paddw m5, m3, m4 + pshufd m0, m5, q1032 + paddw m5, m6 + paddw m5, m0 + paddw m5, [pw_4] + psrlw m5, 3 + psubw m6, m5, m2 + psubw m5, m1 + psubw m1, m3 + psubw m2, m4 + ABSW m5, m5, m0 + ABSW m6, m6, m0 + ABSW m1, m1, m0 + ABSW m2, m2, m0 + paddw m5, m6 + paddw m1, m2 + HADDW m5, m0 + HADDW m1, m2 + movd [r2+8], m5 ;DC prediction cost + movd [r2+4], m1 ;H prediction cost + RET +%endmacro + +INIT_XMM sse2 +INTRA_SAD_X3_4x4 +INIT_XMM ssse3 +INTRA_SAD_X3_4x4 +INIT_XMM avx +INTRA_SAD_X3_4x4 + +;----------------------------------------------------------------------------- +; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] ); +;----------------------------------------------------------------------------- + +;m0 = DC +;m6 = V +;m7 = H +;m1 = DC score +;m2 = V score +;m3 = H score +;m5 = temp +;m4 = pixel row + +%macro INTRA_SAD_HVDC_ITER 2 + mova m4, [r0+(%1-4)*FENC_STRIDEB] + psubw m4, m0 + ABSW m4, m4, m5 + ACCUM paddw, 1, 4, %1 + mova m4, [r0+(%1-4)*FENC_STRIDEB] + psubw m4, m6 + ABSW m4, m4, m5 + ACCUM paddw, 2, 4, %1 + pshufd m5, m7, %2 + psubw m5, [r0+(%1-4)*FENC_STRIDEB] + ABSW m5, m5, m4 + ACCUM paddw, 3, 5, %1 +%endmacro + +%macro INTRA_SAD_X3_8x8 0 +cglobal intra_sad_x3_8x8, 3,3,8 
+ add r0, 4*FENC_STRIDEB + movu m0, [r1+7*SIZEOF_PIXEL] + mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction + mova m7, m0 + paddw m0, m6 + punpckhwd m7, m7 + HADDW m0, m4 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + INTRA_SAD_HVDC_ITER 0, q3333 + INTRA_SAD_HVDC_ITER 1, q2222 + INTRA_SAD_HVDC_ITER 2, q1111 + INTRA_SAD_HVDC_ITER 3, q0000 + movq m7, [r1+7*SIZEOF_PIXEL] + punpcklwd m7, m7 + INTRA_SAD_HVDC_ITER 4, q3333 + INTRA_SAD_HVDC_ITER 5, q2222 + INTRA_SAD_HVDC_ITER 6, q1111 + INTRA_SAD_HVDC_ITER 7, q0000 + HADDW m2, m4 + HADDW m3, m4 + HADDW m1, m4 + movd [r2+0], m2 + movd [r2+4], m3 + movd [r2+8], m1 + RET +%endmacro + +INIT_XMM sse2 +INTRA_SAD_X3_8x8 +INIT_XMM ssse3 +INTRA_SAD_X3_8x8 | ||
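sad16-a.asm gains high-bit-depth versions of pixel_vsad and of the intra_sad_x3_4x4/8x8 kernels. As a rough scalar model of what the vsad kernel computes (assumed semantics: sum of absolute differences between vertically adjacent rows of a 16-pixel-wide column; the name and prototype here are illustrative, not the x264 API):

    #include <stdint.h>
    #include <stdlib.h>
    /* Illustrative scalar version of a vertical SAD over a 16-wide column of
     * 16-bit samples; not the actual x264 prototype. */
    static int vsad_16_scalar(const uint16_t *src, intptr_t stride, int height)
    {
        int score = 0;
        for (int y = 0; y < height - 1; y++, src += stride)
            for (int x = 0; x < 16; x++)
                score += abs(src[x] - src[x + stride]);
        return score;
    }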
Added: x264-snapshot-20120126-2245.tar.bz2/common/x86/trellis-64.asm
@@ -0,0 +1,890 @@ +;***************************************************************************** +;* trellis-64.asm: x86_64 trellis quantization +;***************************************************************************** +;* Copyright (C) 2012 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. +;***************************************************************************** + +; This is a pretty straight-forward translation of the C code, except: +; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level. +; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop: +; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those +; nodes are invalid). +; * Interprocedural register allocation. Eliminates argument-passing overhead +; to trellis_coef* subroutines. Also reduces codesize. + +; Optimizations that I tried, and rejected because they were not faster: +; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3]. +; Costs too much icache compared to the negligible speedup. +; * There are only 21 possible sets of live node_ctxs; we could keep track of +; exactly which set we're in and feed that (along with abs_level) into a jump +; table instead of the switch to select a trellis_coef subroutine. This would +; eliminate all branches about which node_ctxs are live, but costs either a +; bunch of icache or a bunch of call/ret, and the jump table itself is +; unpredictable. +; * Separate versions of trellis_coef* depending on whether we're doing the 1st +; or the 2nd of the two abs_level candidates. This would eliminate some +; branches about if(score is better). +; * Special case more values of coef. I had a coef2 at some intermediate point +; in the optimization process, but it didn't end up worthwhile in conjunction +; with all the other optimizations. +; * Unroll or simd writeback. I don't know why this didn't help. 
+ +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +pd_8: times 4 dd 8 +pd_m16: times 4 dd -16 +pd_0123: dd 0, 1, 2, 3 +pd_4567: dd 4, 5, 6, 7 +sq_1: dq 1, 0 +pq_128: times 2 dq 128 +pq_ffffffff: times 2 dq 0xffffffff + +cextern cabac_entropy +cextern cabac_transition +cextern cabac_size_unary +cextern cabac_transition_unary +cextern dct4_weight_tab +cextern dct8_weight_tab +cextern dct4_weight2_tab +cextern dct8_weight2_tab +cextern last_coeff_flag_offset_8x8 +cextern significant_coeff_flag_offset_8x8 +cextern coeff_flag_offset_chroma_422_dc + +SECTION .text + +%define TRELLIS_SCORE_BIAS 1<<60 +%define SIZEOF_NODE 16 +%define CABAC_SIZE_BITS 8 +%define LAMBDA_BITS 4 + +%macro SQUARE 2 ; dst, tmp + ; could use pmuldq here, to eliminate the abs. but that would involve + ; templating a sse4 version of all of trellis, for negligible speedup. +%if cpuflag(ssse3) + pabsd m%1, m%1 + pmuludq m%1, m%1 +%elifdef HIGH_BIT_DEPTH + ABSD m%2, m%1 + SWAP %1, %2 + pmuludq m%1, m%1 +%else + pmuludq m%1, m%1 + pand m%1, [pq_ffffffff] +%endif +%endmacro + +;----------------------------------------------------------------------------- +; int trellis_cabac_4x4_psy( +; const int *unquant_mf, const uint8_t *zigzag, int lambda2, +; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct, +; uint8_t *cabac_state_sig, uint8_t *cabac_state_last, +; uint64_t level_state0, uint16_t level_state1, +; int b_ac, dctcoef *fenc_dct, int psy_trellis ) +;----------------------------------------------------------------------------- +%macro TRELLIS 4 +%define num_coefs %2 +%define dc %3 +%define psy %4 +cglobal %1, 4,15,9 + %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef* + %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15) + SUB rsp, pad + DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last +%ifdef WIN64 + %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array +%else + %define level_statem rsp+stack_offset+32 +%endif + %define b_acm r11m ; 4x4 only + %define b_interlacedm r11m ; 8x8 only + %define i_coefsm1 r11m ; dc only + %define fenc_dctm r12m + %define psy_trellism r13m +%if num_coefs == 64 + shl dword b_interlacedm, 6 + %define dct_weight1_tab dct8_weight_tab + %define dct_weight2_tab dct8_weight2_tab +%else + %define dct_weight1_tab dct4_weight_tab + %define dct_weight2_tab dct4_weight2_tab +%endif + + %define stack rsp + %define last_nnzm [stack+0] + %define zigzagm [stack+8] + mov last_nnzm, iid + mov zigzagm, zigzagq +%ifndef WIN64 + %define orig_coefsm [stack+16] + %define quant_coefsm [stack+24] + mov orig_coefsm, orig_coefsq + mov quant_coefsm, quant_coefsq +%endif + %define unquant_mfm [stack+32] + %define levelgt1_ctxm [stack+40] + %define ssd stack+48 + %define cost_siglast stack+80 + %define level_tree stack+96 + + ; trellis_node_t is layed out differently than C. + ; struct-of-arrays rather than array-of-structs, for simd. 
+ %define nodes_curq r7 + %define nodes_prevq r8 + %define node_score(x) x*8 + %define node_level_idx(x) 64+x*4 + %define node_cabac_state(x) 96+x*4 + lea nodes_curq, [level_tree + level_tree_size] + lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE] + mov r6, TRELLIS_SCORE_BIAS + mov [nodes_curq + node_score(0)], r6 + mov dword [nodes_curq + node_level_idx(0)], 0 + movd mm0, [level_statem + 0] + punpcklbw mm0, [level_statem + 4] + punpcklwd mm0, [level_statem + 8] + %define level_state_packed mm0 ; version for copying into node.cabac_state + pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX + movq [nodes_curq + node_score(1)], m7 + mova [nodes_curq + node_score(2)], m7 + + %define levels_usedq r4 + %define levels_usedd r4d + mov dword [level_tree], 0 + mov levels_usedd, 1 + + %define abs_levelq r9 + %define abs_leveld r9d + %define abs_coefq r14 + %define zigzagiq r5 + %define zigzagid r5d + +%if num_coefs == 8 + mov dword levelgt1_ctxm, 8 +%else + mov dword levelgt1_ctxm, 9 +%endif +%if psy + movd m6, psy_trellism + %define psy_trellis m6 +%elif dc + movd m6, [unquant_mfq] + paddd m6, m6 + punpcklqdq m6, m6 + %define unquant_mf m6 +%endif +%ifdef PIC +%if dc == 0 + mov unquant_mfm, unquant_mfq +%endif + ; Keep a single offset register to PICify all global constants. + ; They're all relative to "beginning of this asm file's .text section", + ; even tables that aren't in this file. + ; (Any address in .text would work, this one was just convenient.) + lea r0, [$$] + %define GLOBAL +r0-$$ +%else + %define GLOBAL +%endif + + TRELLIS_LOOP 0 ; node_ctx 0..3 + TRELLIS_LOOP 1 ; node_ctx 1..7 + +.writeback: + ; int level = bnode->level_idx; + ; for( int i = b_ac; i <= last_nnz; i++ ) + ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]); + ; level = level_tree[level].next; + mov iid, last_nnzm + add zigzagq, iiq + neg iiq +%if num_coefs == 16 && dc == 0 + mov r2d, b_acm + add iiq, r2 +%endif + %define dctq r10 + mov r0d, [nodes_curq + node_level_idx(0) + rax*4] +.writeback_loop: + movzx r2, byte [zigzagq + iiq] +%if cpuflag(ssse3) + movd m0, [level_tree + r0*4] + movzx r0, word [level_tree + r0*4] + psrld m0, 16 + movd m1, [dctq + r2*SIZEOF_DCTCOEF] +%ifdef HIGH_BIT_DEPTH + psignd m0, m1 + movd [dctq + r2*SIZEOF_DCTCOEF], m0 +%else + psignw m0, m1 + movd r4d, m0 + mov [dctq + r2*SIZEOF_DCTCOEF], r4w +%endif +%else + mov r5d, [level_tree + r0*4] +%ifdef HIGH_BIT_DEPTH + mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF] +%else + movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF] +%endif + movzx r0d, r5w + sar r4d, 31 + shr r5d, 16 + xor r5d, r4d + sub r5d, r4d +%ifdef HIGH_BIT_DEPTH + mov [dctq + r2*SIZEOF_DCTCOEF], r5d +%else + mov [dctq + r2*SIZEOF_DCTCOEF], r5w +%endif +%endif + inc iiq + jle .writeback_loop + + mov eax, 1 +.return: + ADD rsp, pad + RET + +%if num_coefs == 16 && dc == 0 +.return_zero: + pxor m0, m0 + mova [r10+ 0], m0 + mova [r10+16], m0 +%ifdef HIGH_BIT_DEPTH + mova [r10+32], m0 + mova [r10+48], m0 +%endif + jmp .return +%endif +%endmacro ; TRELLIS + + + +%macro TRELLIS_LOOP 1 ; ctx_hi +.i_loop%1: + ; if( !quant_coefs[i] ) + mov r6, quant_coefsm +%ifdef HIGH_BIT_DEPTH + mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF] +%else + movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF] +%endif + + ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : + ; num_coefs == 8 ? 
coeff_flag_offset_chroma_422_dc[i] : i; + mov r10, cabac_state_sigm +%if num_coefs == 64 + mov r6d, b_interlacedm +%ifdef PIC + add r6d, iid + movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL] +%else + movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq] +%endif + movzx r10, byte [r10 + r6] +%elif num_coefs == 8 + movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL] + movzx r10, byte [r10 + r13] +%else + movzx r10, byte [r10 + iiq] +%endif + + test abs_leveld, abs_leveld + jnz %%.nonzero_quant_coef + +%if %1 == 0 + ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ) + ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); + ; nodes_cur[0].score -= cost_sig0; + movzx r10, word [cabac_entropy + r10*2 GLOBAL] + imul r10, lambda2q + shr r10, CABAC_SIZE_BITS - LAMBDA_BITS + sub [nodes_curq + node_score(0)], r10 +%endif + ZERO_LEVEL_IDX %1, cur + jmp .i_continue%1 + +%%.nonzero_quant_coef: + ; int sign_coef = orig_coefs[zigzag[i]]; + ; int abs_coef = abs( sign_coef ); + ; int q = abs( quant_coefs[i] ); + movzx zigzagid, byte [zigzagq+iiq] + movd m0, abs_leveld + mov r6, orig_coefsm +%ifdef HIGH_BIT_DEPTH + movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] +%else + movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m1, 16 +%endif + punpcklqdq m0, m0 ; quant_coef + punpcklqdq m1, m1 ; sign_coef +%if cpuflag(ssse3) + pabsd m0, m0 + pabsd m2, m1 ; abs_coef +%else + pxor m8, m8 + pcmpgtd m8, m1 ; sign_mask + pxor m0, m8 + pxor m2, m1, m8 + psubd m0, m8 + psubd m2, m8 +%endif + psubd m0, [sq_1] ; abs_level + movd abs_leveld, m0 + + xchg nodes_curq, nodes_prevq + + ; if( i < num_coefs-1 ) + ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i; + ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i + ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ); + ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 ); + ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1; + ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1; +%if %1 == 0 +%if dc && num_coefs != 8 + cmp iid, i_coefsm1 +%else + cmp iid, num_coefs-1 +%endif + je %%.zero_siglast +%endif + movzx r11, word [cabac_entropy + r10*2 GLOBAL] + xor r10, 1 + movzx r12, word [cabac_entropy + r10*2 GLOBAL] + mov [cost_siglast+0], r11d + mov r10, cabac_state_lastm +%if num_coefs == 64 + movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL] + movzx r10, byte [r10 + r6] +%elif num_coefs == 8 + movzx r10, byte [r10 + r13] +%else + movzx r10, byte [r10 + iiq] +%endif + movzx r11, word [cabac_entropy + r10*2 GLOBAL] + add r11, r12 + mov [cost_siglast+4], r11d +%if %1 == 0 + xor r10, 1 + movzx r10, word [cabac_entropy + r10*2 GLOBAL] + add r10, r12 + mov [cost_siglast+8], r10d +%endif +%%.skip_siglast: + + ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8); + ; int d = abs_coef - unquant_abs_level; + ; uint64_t ssd = (int64_t)d*d * coef_weight[i]; +%if dc + pmuludq m0, unquant_mf +%else +%ifdef PIC + mov r10, unquant_mfm + movd m3, [r10 + zigzagiq*4] +%else + movd m3, [unquant_mfq + zigzagiq*4] +%endif + punpcklqdq m3, m3 + pmuludq m0, m3 +%endif + paddd m0, [pq_128] + psrld m0, 8 ; unquant_abs_level +%if psy || dc == 0 + mova m4, m0 +%endif + psubd m0, m2 + SQUARE 0, 3 +%if dc + psllq m0, 8 +%else + movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] + punpcklqdq m5, m5 + pmuludq m0, m5 +%endif + +%if psy + test iid, 
iid + jz %%.dc_rounding + ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef + ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef)); + ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis; + ; ssd1[k] -= psy_weight * psy_value; + mov r6, fenc_dctm +%ifdef HIGH_BIT_DEPTH + movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] +%else + movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m3, 16 ; orig_coef +%endif + punpcklqdq m3, m3 +%if cpuflag(ssse3) + psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) +%else + PSIGN d, m4, m8 +%endif + psubd m3, m1 ; predicted_coef + paddd m4, m3 +%if cpuflag(ssse3) + pabsd m4, m4 +%else + ABSD m3, m4 + SWAP 4, 3 +%endif + movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] + pmuludq m1, psy_trellis + punpcklqdq m1, m1 + pmuludq m4, m1 + psubq m0, m4 +%if %1 +%%.dc_rounding: +%endif +%endif +%if %1 == 0 + mova [ssd], m0 +%endif + +%if dc == 0 && %1 == 0 + test iid, iid + jnz %%.skip_dc_rounding +%%.dc_rounding: + ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. + ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15); + ; uint64_t ssd = (int64_t)d*d * coef_weight[i]; + psrad m1, 31 ; sign_coef>>31 + paddd m4, [pd_8] + paddd m4, m1 + pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15 + psubd m4, m2 ; d + SQUARE 4, 3 + pmuludq m4, m5 + mova [ssd], m4 +%%.skip_dc_rounding: +%endif + mova [ssd+16], m0 + + %assign stack_offset_bak stack_offset + cmp abs_leveld, 1 + jl %%.switch_coef0 +%if %1 == 0 + mov r10, [ssd] ; trellis_coef* args +%endif + movq r12, m0 + ; for( int j = 0; j < 8; j++ ) + ; nodes_cur[j].score = TRELLIS_SCORE_MAX; +%if cpuflag(ssse3) + mova [nodes_curq + node_score(0)], m7 + mova [nodes_curq + node_score(2)], m7 +%else ; avoid store-forwarding stalls on k8/k10 +%if %1 == 0 + movq [nodes_curq + node_score(0)], m7 +%endif + movq [nodes_curq + node_score(1)], m7 + movq [nodes_curq + node_score(2)], m7 + movq [nodes_curq + node_score(3)], m7 +%endif + mova [nodes_curq + node_score(4)], m7 + mova [nodes_curq + node_score(6)], m7 + je %%.switch_coef1 +%%.switch_coefn: + call trellis_coefn.entry%1 + call trellis_coefn.entry%1b + jmp .i_continue1 +%%.switch_coef1: + call trellis_coef1.entry%1 + call trellis_coefn.entry%1b + jmp .i_continue1 +%%.switch_coef0: + call trellis_coef0_%1 + call trellis_coef1.entry%1b + +.i_continue%1: + dec iid +%if num_coefs == 16 && dc == 0 + cmp iid, b_acm +%endif + jge .i_loop%1 + + call trellis_bnode_%1 +%if %1 == 0 +%if num_coefs == 16 && dc == 0 + jz .return_zero +%else + jz .return +%endif + jmp .writeback + +%%.zero_siglast: + xor r6d, r6d + mov [cost_siglast+0], r6 + mov [cost_siglast+8], r6d + jmp %%.skip_siglast +%endif +%endmacro ; TRELLIS_LOOP + +; just a synonym for %if +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev + ; for( int j = 0; j < 8; j++ ) + ; nodes_cur[j].level_idx = levels_used; + ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 }; + ; levels_used++; + add levels_usedd, 3 + and levels_usedd, ~3 ; allow aligned stores + movd m0, levels_usedd + pshufd m0, m0, 0 + IF%1 mova m1, m0 + paddd m0, [pd_0123] + IF%1 paddd m1, [pd_4567] + mova m2, [nodes_%2q + node_level_idx(0)] + IF%1 mova m3, [nodes_%2q + node_level_idx(4)] + mova [nodes_curq + node_level_idx(0)], m0 + IF%1 mova [nodes_curq + node_level_idx(4)], m1 + mova [level_tree + (levels_usedq+0)*4], m2 + IF%1 mova [level_tree + (levels_usedq+4)*4], m3 + add levels_usedd, 
(1+%1)*4 +%endmacro + +INIT_XMM sse2 +TRELLIS trellis_cabac_4x4, 16, 0, 0 +TRELLIS trellis_cabac_8x8, 64, 0, 0 +TRELLIS trellis_cabac_4x4_psy, 16, 0, 1 +TRELLIS trellis_cabac_8x8_psy, 64, 0, 1 +TRELLIS trellis_cabac_dc, 16, 1, 0 +TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0 +INIT_XMM ssse3 +TRELLIS trellis_cabac_4x4, 16, 0, 0 +TRELLIS trellis_cabac_8x8, 64, 0, 0 +TRELLIS trellis_cabac_4x4_psy, 16, 0, 1 +TRELLIS trellis_cabac_8x8_psy, 64, 0, 1 +TRELLIS trellis_cabac_dc, 16, 1, 0 +TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0 + + + +%define stack rsp+gprsize +%define scoreq r14 +%define bitsq r13 +%define bitsd r13d + +INIT_XMM +%macro clocal 1 + ALIGN 16 + global mangle(x264_%1) + mangle(x264_%1): + %1: + %assign stack_offset stack_offset_bak+gprsize +%endmacro + +%macro TRELLIS_BNODE 1 ; ctx_hi +clocal trellis_bnode_%1 + ; int j = ctx_hi?1:0; + ; trellis_node_t *bnode = &nodes_cur[j]; + ; while( ++j < (ctx_hi?8:4) ) + ; if( nodes_cur[j].score < bnode->score ) + ; bnode = &nodes_cur[j]; +%assign j %1 + mov rax, [nodes_curq + node_score(j)] + lea rax, [rax*8 + j] +%rep 3+3*%1 +%assign j j+1 + mov r11, [nodes_curq + node_score(j)] + lea r11, [r11*8 + j] + cmp rax, r11 + cmova rax, r11 +%endrep + mov r10, dctm + and eax, 7 + ret +%endmacro ; TRELLIS_BNODE +TRELLIS_BNODE 0 +TRELLIS_BNODE 1 + + +%macro TRELLIS_COEF0 1 ; ctx_hi +clocal trellis_coef0_%1 + ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); + mov r11d, [cost_siglast+0] + imul r11, lambda2q + shr r11, CABAC_SIZE_BITS - LAMBDA_BITS + add r11, [ssd+16] +%if %1 == 0 + ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1; + mov scoreq, [nodes_prevq + node_score(0)] + add scoreq, [ssd] + sub scoreq, r11 + mov [nodes_curq + node_score(0)], scoreq +%endif + ; memcpy + mov scoreq, [nodes_prevq + node_score(1)] + mov [nodes_curq + node_score(1)], scoreq + mova m1, [nodes_prevq + node_score(2)] + mova [nodes_curq + node_score(2)], m1 +%if %1 + mova m1, [nodes_prevq + node_score(4)] + mova [nodes_curq + node_score(4)], m1 + mova m1, [nodes_prevq + node_score(6)] + mova [nodes_curq + node_score(6)], m1 +%endif + mov r6d, [nodes_prevq + node_cabac_state(3)] + mov [nodes_curq + node_cabac_state(3)], r6d +%if %1 + mova m1, [nodes_prevq + node_cabac_state(4)] + mova [nodes_curq + node_cabac_state(4)], m1 +%endif + ZERO_LEVEL_IDX %1, prev + ret +%endmacro ; TRELLIS_COEF0 +TRELLIS_COEF0 0 +TRELLIS_COEF0 1 + + + +%macro START_COEF 1 ; gt1 + ; if( (int64_t)nodes_prev[0].score < 0 ) continue; + mov scoreq, [nodes_prevq + node_score(j)] +%if j > 0 + test scoreq, scoreq + js .ctx %+ nextj_if_invalid +%endif + + ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 ); +%if j >= 3 + movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4 + movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL] +%else + movzx r6d, byte [level_statem + coeff_abs_level1_offs] +%endif +%if %1 + xor r6d, 1 +%endif + movzx bitsd, word [cabac_entropy + r6*2 GLOBAL] + + ; n.score += ssd; + ; unsigned f8_bits = cost_siglast[ j ? 
1 : 2 ]; +%if j == 0 + add scoreq, r10 + add bitsd, [cost_siglast+8] +%else + add scoreq, r12 + add bitsd, [cost_siglast+4] +%endif +%endmacro ; START_COEF + +%macro END_COEF 1 + ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); + imul bitsq, lambda2q + shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS + add scoreq, bitsq + + ; if( n.score < nodes_cur[node_ctx].score ) + ; SET_LEVEL( n, abs_level ); + ; nodes_cur[node_ctx] = n; + cmp scoreq, [nodes_curq + node_score(node_ctx)] + jae .ctx %+ nextj_if_valid + mov [nodes_curq + node_score(node_ctx)], scoreq +%if j == 2 || (j <= 3 && node_ctx == 4) + ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states + movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed +%elif j >= 3 + ; if we have updated before, then copy cabac_state from the parent node + mov r6d, [nodes_prevq + node_cabac_state(j)] + mov [nodes_curq + node_cabac_state(node_ctx)], r6d +%endif +%if j >= 3 ; skip the transition if we're not going to reuse the context + mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2 +%endif +%if %1 && node_ctx == 7 + mov r6d, levelgt1_ctxm + mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b +%endif + mov r6d, [nodes_prevq + node_level_idx(j)] +%if %1 + mov r11d, abs_leveld + shl r11d, 16 + or r6d, r11d +%else + or r6d, 1<<16 +%endif + mov [level_tree + levels_usedq*4], r6d + mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd + inc levels_usedd +%endmacro ; END_COEF + + + +%macro COEF1 2 + %assign j %1 + %assign nextj_if_valid %1+1 + %assign nextj_if_invalid %2 +%if j < 4 + %assign coeff_abs_level1_offs j+1 +%else + %assign coeff_abs_level1_offs 0 +%endif +%if j < 3 + %assign node_ctx j+1 +%else + %assign node_ctx j +%endif +.ctx %+ j: + START_COEF 0 + add bitsd, 1 << CABAC_SIZE_BITS + END_COEF 0 +%endmacro ; COEF1 + +%macro COEFN 2 + %assign j %1 + %assign nextj_if_valid %2 + %assign nextj_if_invalid %2 +%if j < 4 + %assign coeff_abs_level1_offs j+1 + %assign coeff_abs_levelgt1_offs 5 +%else + %assign coeff_abs_level1_offs 0 + %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc +%endif +%if j < 4 + %assign node_ctx 4 +%elif j < 7 + %assign node_ctx j+1 +%else + %assign node_ctx 7 +%endif +.ctx %+ j: + START_COEF 1 + ; if( abs_level >= 15 ) + ; bits += bs_size_ue_big(...) 
+ add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX + ; n.cabac_state[levelgt1_ctx] +%if j == 7 ; && compiling support for 4:2:2 + mov r6d, levelgt1_ctxm + %define coeff_abs_levelgt1_offs r6 +%endif +%if j == 7 + movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9 +%else + movzx r10, byte [level_statem + coeff_abs_levelgt1_offs] +%endif + ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]]; + add r10d, r1d + movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL] + add bitsd, r6d +%if node_ctx == 7 + movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL] +%endif + END_COEF 1 +%endmacro ; COEFN + + + +clocal trellis_coef1 +.entry0b: ; ctx_lo, larger of the two abs_level candidates + mov r10, [ssd+8] + sub r10, r11 + mov r12, [ssd+24] + sub r12, r11 +.entry0: ; ctx_lo, smaller of the two abs_level candidates + COEF1 0, 4 + COEF1 1, 4 + COEF1 2, 4 + COEF1 3, 4 +.ctx4: + rep ret +.entry1b: ; ctx_hi, larger of the two abs_level candidates + mov r12, [ssd+24] + sub r12, r11 +.entry1: ; ctx_hi, smaller of the two abs_level candidates +trellis_coef1_hi: + COEF1 1, 2 + COEF1 2, 3 + COEF1 3, 4 + COEF1 4, 5 + COEF1 5, 6 + COEF1 6, 7 + COEF1 7, 8 +.ctx8: + rep ret + +%macro COEFN_PREFIX 1 + ; int prefix = X264_MIN( abs_level - 1, 14 ); + mov r1d, abs_leveld + cmp abs_leveld, 15 + jge .level_suffix%1 + xor r5d, r5d +.skip_level_suffix%1: + shl r1d, 7 +%endmacro + +%macro COEFN_SUFFIX 1 +.level_suffix%1: + ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS; + lea r5d, [abs_levelq-14] + bsr r5d, r5d + shl r5d, CABAC_SIZE_BITS+1 + add r5d, 1<<CABAC_SIZE_BITS + ; int prefix = X264_MIN( abs_level - 1, 14 ); + mov r1d, 15 + jmp .skip_level_suffix%1 +%endmacro + +clocal trellis_coefn +.entry0b: + mov r10, [ssd+8] + mov r12, [ssd+24] + inc abs_leveld +.entry0: + ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then + ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints + ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize. + ; I can't do both, as return-on-first-failure doesn't work for ctx_hi. + ; The C version has to be fully separate since C doesn't support multiple + ; entrypoints. But return-on-first-failure isn't very important here (as + ; opposed to coef1), so I might as well reduce codesize. + COEFN_PREFIX 0 + COEFN 0, 1 + COEFN 1, 2 + COEFN 2, 3 + COEFN 3, 8 +.ctx8: + mov zigzagq, zigzagm ; unspill since r1 was clobbered + ret +.entry1b: + mov r12, [ssd+24] + inc abs_leveld +.entry1: + COEFN_PREFIX 1 + COEFN 4, 5 + COEFN 5, 6 + COEFN 6, 7 + COEFN 7, 1 + jmp .ctx1 + COEFN_SUFFIX 0 + COEFN_SUFFIX 1 | ||
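The comment block at the top of trellis-64.asm explains the central layout decision: the C trellis works on an array of node structs, while the asm keeps the same fields as a struct of arrays so that four node contexts can be updated per SIMD instruction, which is where the node_score(x) = x*8, node_level_idx(x) = 64 + x*4 and node_cabac_state(x) = 96 + x*4 offsets come from. A sketch of the two layouts, with illustrative field names:

    #include <stdint.h>
    /* Array-of-structs: roughly how the C implementation views one node. */
    typedef struct { uint64_t score; int32_t level_idx; uint8_t cabac_state[4]; } node_aos_t;

    /* Struct-of-arrays, matching the offsets used in the asm above:
     * scores at x*8, level_idx at 64 + x*4, cabac_state at 96 + x*4. */
    typedef struct {
        uint64_t score[8];
        int32_t  level_idx[8];
        uint8_t  cabac_state[8][4];
    } nodes_soa_t;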
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/x86inc.asm
@@ -1,11 +1,12 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project +;* Copyright (C) 2005-2012 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <hengar-6@student.ltu.se> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -91,6 +92,9 @@ default rel %endif +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU intelnop + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -124,18 +128,20 @@ ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size -%macro DECLARE_REG 6 +%macro DECLARE_REG 5-6 %define r%1q %2 %define r%1d %3 %define r%1w %4 %define r%1b %5 - %define r%1m %6 - %ifid %6 ; i.e. it's a register + %if %0 == 5 + %define r%1m %3 %define r%1mp %2 %elifdef ARCH_X86_64 ; memory - %define r%1mp qword %6 + %define r%1m [rsp + stack_offset + %6] + %define r%1mp qword r %+ %1m %else - %define r%1mp dword %6 + %define r%1m [esp + stack_offset + %6] + %define r%1mp dword r %+ %1m %endif %define r%1 %2 %endmacro @@ -183,7 +189,7 @@ %endrep %endmacro -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %ifdef ARCH_X86_64 %define gprsize 8 @@ -201,6 +207,33 @@ %assign stack_offset stack_offset-gprsize %endmacro +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + %macro SUB 2 sub %1, %2 %ifidn %1, rsp @@ -247,6 +280,8 @@ %endrep %endif + %assign %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q @@ -258,40 +293,36 @@ %assign %%i %%i+1 %rotate 1 %endrep - %assign n_arg_names %%i + %assign stack_offset %%stack_offset + %assign n_arg_names %0 %endmacro %ifdef WIN64 ; Windows x64 ;================================================= -DECLARE_REG 0, rcx, ecx, cx, cl, ecx -DECLARE_REG 1, rdx, edx, dx, dl, edx -DECLARE_REG 2, r8, r8d, r8w, r8b, r8d -DECLARE_REG 3, r9, r9d, r9w, r9b, r9d -DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40] -DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48] -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] -%define r7m [rsp + stack_offset + 64] -%define r8m [rsp + stack_offset + 72] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp + stack_offset + 8 + %1*8] - %endif -%endmacro +DECLARE_REG 0, rcx, ecx, cx, cl +DECLARE_REG 1, rdx, edx, dx, dl +DECLARE_REG 2, R8, R8D, R8W, R8B +DECLARE_REG 3, R9, R9D, R9W, R9B +DECLARE_REG 4, R10, R10D, R10W, R10B, 40 +DECLARE_REG 5, R11, R11D, R11W, R11B, 48 +DECLARE_REG 6, rax, eax, ax, al, 56 +DECLARE_REG 7, rdi, edi, di, dil, 64 +DECLARE_REG 8, rsi, esi, si, sil, 72 +DECLARE_REG 
9, rbx, ebx, bx, bl, 80 +DECLARE_REG 10, rbp, ebp, bp, bpl, 88 +DECLARE_REG 11, R12, R12D, R12W, R12B, 96 +DECLARE_REG 12, R13, R13D, R13W, R13B, 104 +DECLARE_REG 13, R14, R14D, R14W, R14B, 112 +DECLARE_REG 14, R15, R15D, R15W, R15B, 120 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... - ASSERT %2 >= %1 + %assign num_args %1 %assign regs_used %2 - ASSERT regs_used <= 7 - %if regs_used > 4 - push r4 - push r5 - %assign stack_offset stack_offset+16 - %endif + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 WIN64_SPILL_XMM %3 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS %4 %endmacro @@ -302,12 +333,11 @@ %endif ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 - sub rsp, (xmm_regs_used-6)*16+16 - %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 + SUB rsp, (xmm_regs_used-6)*16+16 %assign %%i xmm_regs_used %rep (xmm_regs_used-6) %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i %endrep %endif %endmacro @@ -317,7 +347,7 @@ %assign %%i xmm_regs_used %rep (xmm_regs_used-6) %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8] + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] %endrep add %1, (xmm_regs_used-6)*16+16 %endif @@ -331,15 +361,12 @@ %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp - %if regs_used > 4 - pop r5 - pop r4 - %endif + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 ret %endmacro %macro REP_RET 0 - %if regs_used > 4 || xmm_regs_used > 6 + %if regs_used > 7 || xmm_regs_used > 6 RET %else rep ret @@ -348,92 +375,80 @@ %elifdef ARCH_X86_64 ; *nix x64 ;============================================= -DECLARE_REG 0, rdi, edi, di, dil, edi -DECLARE_REG 1, rsi, esi, si, sil, esi -DECLARE_REG 2, rdx, edx, dx, dl, edx -DECLARE_REG 3, rcx, ecx, cx, cl, ecx -DECLARE_REG 4, r8, r8d, r8w, r8b, r8d -DECLARE_REG 5, r9, r9d, r9w, r9b, r9d -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] -%define r7m [rsp + stack_offset + 16] -%define r8m [rsp + stack_offset + 24] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp - 40 + %1*8] - %endif -%endmacro +DECLARE_REG 0, rdi, edi, di, dil +DECLARE_REG 1, rsi, esi, si, sil +DECLARE_REG 2, rdx, edx, dx, dl +DECLARE_REG 3, rcx, ecx, cx, cl +DECLARE_REG 4, R8, R8D, R8W, R8B +DECLARE_REG 5, R9, R9D, R9W, R9B +DECLARE_REG 6, rax, eax, ax, al, 8 +DECLARE_REG 7, R10, R10D, R10W, R10B, 16 +DECLARE_REG 8, R11, R11D, R11W, R11B, 24 +DECLARE_REG 9, rbx, ebx, bx, bl, 32 +DECLARE_REG 10, rbp, ebp, bp, bpl, 40 +DECLARE_REG 11, R12, R12D, R12W, R12B, 48 +DECLARE_REG 12, R13, R13D, R13W, R13B, 56 +DECLARE_REG 13, R14, R14D, R14W, R14B, 64 +DECLARE_REG 14, R15, R15D, R15W, R15B, 72 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
- ASSERT %2 >= %1 - ASSERT %2 <= 7 - LOAD_IF_USED 6, %1 + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS %4 %endmacro %macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 ret %endmacro %macro REP_RET 0 - rep ret + %if regs_used > 9 + RET + %else + rep ret + %endif %endmacro %else ; X86_32 ;============================================================== -DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] -DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] -DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] -DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] -DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] -DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] -DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] -%define r7m [esp + stack_offset + 32] -%define r8m [esp + stack_offset + 36] +DECLARE_REG 0, eax, eax, ax, al, 4 +DECLARE_REG 1, ecx, ecx, cx, cl, 8 +DECLARE_REG 2, edx, edx, dx, dl, 12 +DECLARE_REG 3, ebx, ebx, bx, bl, 16 +DECLARE_REG 4, esi, esi, si, null, 20 +DECLARE_REG 5, edi, edi, di, null, 24 +DECLARE_REG 6, ebp, ebp, bp, null, 28 %define rsp esp -%macro PUSH_IF_USED 1 ; reg_id - %if %1 < regs_used - push r%1 - %assign stack_offset stack_offset+4 - %endif +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep %endmacro -%macro POP_IF_USED 1 ; reg_id - %if %1 < regs_used - pop r%1 - %endif -%endmacro - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [esp + stack_offset + 4 + %1*4] - %endif -%endmacro +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
- ASSERT %2 >= %1 + %assign num_args %1 %assign regs_used %2 - ASSERT regs_used <= 7 - PUSH_IF_USED 3 - PUSH_IF_USED 4 - PUSH_IF_USED 5 - PUSH_IF_USED 6 - LOAD_IF_USED 0, %1 - LOAD_IF_USED 1, %1 - LOAD_IF_USED 2, %1 - LOAD_IF_USED 3, %1 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 DEFINE_ARGS %4 %endmacro %macro RET 0 - POP_IF_USED 6 - POP_IF_USED 5 - POP_IF_USED 4 - POP_IF_USED 3 + POP_IF_USED 6, 5, 4, 3 ret %endmacro @@ -454,8 +469,6 @@ %endmacro %endif - - ;============================================================================= ; arch-independent part ;============================================================================= @@ -784,16 +797,38 @@ %endrep %undef i +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + ;%1 == instruction ;%2 == 1 if float, 0 if int -;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm) +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) ;%4 == number of operands given ;%5+: operands %macro RUN_AVX_INSTR 6-7+ - %if sizeof%5==32 - v%1 %5, %6, %7 + %ifid %5 + %define %%sizeofreg sizeof%5 + %elifid %6 + %define %%sizeofreg sizeof%6 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif %else - %if sizeof%5==8 + %if %%sizeofreg==8 %define %%regmov movq %elif %2 %define %%regmov movaps @@ -803,16 +838,17 @@ %if %4>=3+%3 %ifnidn %5, %6 - %if avx_enabled && sizeof%5==16 + %if avx_enabled && %%sizeofreg==16 v%1 %5, %6, %7 %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 %%regmov %5, %6 %1 %5, %7 %endif %else %1 %5, %7 %endif - %elif %3 + %elif %4>=3 %1 %5, %6, %7 %else %1 %5, %6 @@ -820,15 +856,37 @@ %endif %endmacro +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. +%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if avx_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + ;%1 == instruction ;%2 == 1 if float, 0 if int -;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm) -%macro AVX_INSTR 3 - %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3 +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. 
doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 %ifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3 + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 %else @@ -837,158 +895,158 @@ %endmacro %endmacro -AVX_INSTR addpd, 1, 0 -AVX_INSTR addps, 1, 0 -AVX_INSTR addsd, 1, 0 -AVX_INSTR addss, 1, 0 -AVX_INSTR addsubpd, 1, 0 -AVX_INSTR addsubps, 1, 0 -AVX_INSTR andpd, 1, 0 -AVX_INSTR andps, 1, 0 -AVX_INSTR andnpd, 1, 0 -AVX_INSTR andnps, 1, 0 -AVX_INSTR blendpd, 1, 0 -AVX_INSTR blendps, 1, 0 -AVX_INSTR blendvpd, 1, 0 -AVX_INSTR blendvps, 1, 0 -AVX_INSTR cmppd, 1, 0 -AVX_INSTR cmpps, 1, 0 -AVX_INSTR cmpsd, 1, 0 -AVX_INSTR cmpss, 1, 0 -AVX_INSTR divpd, 1, 0 -AVX_INSTR divps, 1, 0 -AVX_INSTR divsd, 1, 0 -AVX_INSTR divss, 1, 0 -AVX_INSTR dppd, 1, 0 -AVX_INSTR dpps, 1, 0 -AVX_INSTR haddpd, 1, 0 -AVX_INSTR haddps, 1, 0 -AVX_INSTR hsubpd, 1, 0 -AVX_INSTR hsubps, 1, 0 -AVX_INSTR maxpd, 1, 0 -AVX_INSTR maxps, 1, 0 -AVX_INSTR maxsd, 1, 0 -AVX_INSTR maxss, 1, 0 -AVX_INSTR minpd, 1, 0 -AVX_INSTR minps, 1, 0 -AVX_INSTR minsd, 1, 0 -AVX_INSTR minss, 1, 0 -AVX_INSTR movsd, 1, 0 -AVX_INSTR movss, 1, 0 -AVX_INSTR mpsadbw, 0, 1 -AVX_INSTR mulpd, 1, 0 -AVX_INSTR mulps, 1, 0 -AVX_INSTR mulsd, 1, 0 -AVX_INSTR mulss, 1, 0 -AVX_INSTR orpd, 1, 0 -AVX_INSTR orps, 1, 0 -AVX_INSTR packsswb, 0, 0 -AVX_INSTR packssdw, 0, 0 -AVX_INSTR packuswb, 0, 0 -AVX_INSTR packusdw, 0, 0 -AVX_INSTR paddb, 0, 0 -AVX_INSTR paddw, 0, 0 -AVX_INSTR paddd, 0, 0 -AVX_INSTR paddq, 0, 0 -AVX_INSTR paddsb, 0, 0 -AVX_INSTR paddsw, 0, 0 -AVX_INSTR paddusb, 0, 0 -AVX_INSTR paddusw, 0, 0 -AVX_INSTR palignr, 0, 1 -AVX_INSTR pand, 0, 0 -AVX_INSTR pandn, 0, 0 -AVX_INSTR pavgb, 0, 0 -AVX_INSTR pavgw, 0, 0 -AVX_INSTR pblendvb, 0, 0 -AVX_INSTR pblendw, 0, 1 -AVX_INSTR pcmpestri, 0, 0 -AVX_INSTR pcmpestrm, 0, 0 -AVX_INSTR pcmpistri, 0, 0 -AVX_INSTR pcmpistrm, 0, 0 -AVX_INSTR pcmpeqb, 0, 0 -AVX_INSTR pcmpeqw, 0, 0 -AVX_INSTR pcmpeqd, 0, 0 -AVX_INSTR pcmpeqq, 0, 0 -AVX_INSTR pcmpgtb, 0, 0 -AVX_INSTR pcmpgtw, 0, 0 -AVX_INSTR pcmpgtd, 0, 0 -AVX_INSTR pcmpgtq, 0, 0 -AVX_INSTR phaddw, 0, 0 -AVX_INSTR phaddd, 0, 0 -AVX_INSTR phaddsw, 0, 0 -AVX_INSTR phsubw, 0, 0 -AVX_INSTR phsubd, 0, 0 -AVX_INSTR phsubsw, 0, 0 -AVX_INSTR pmaddwd, 0, 0 -AVX_INSTR pmaddubsw, 0, 0 -AVX_INSTR pmaxsb, 0, 0 -AVX_INSTR pmaxsw, 0, 0 -AVX_INSTR pmaxsd, 0, 0 -AVX_INSTR pmaxub, 0, 0 -AVX_INSTR pmaxuw, 0, 0 -AVX_INSTR pmaxud, 0, 0 -AVX_INSTR pminsb, 0, 0 -AVX_INSTR pminsw, 0, 0 -AVX_INSTR pminsd, 0, 0 -AVX_INSTR pminub, 0, 0 -AVX_INSTR pminuw, 0, 0 -AVX_INSTR pminud, 0, 0 -AVX_INSTR pmulhuw, 0, 0 -AVX_INSTR pmulhrsw, 0, 0 -AVX_INSTR pmulhw, 0, 0 -AVX_INSTR pmullw, 0, 0 -AVX_INSTR pmulld, 0, 0 -AVX_INSTR pmuludq, 0, 0 -AVX_INSTR pmuldq, 0, 0 -AVX_INSTR por, 0, 0 -AVX_INSTR psadbw, 0, 0 -AVX_INSTR pshufb, 0, 0 -AVX_INSTR psignb, 0, 0 -AVX_INSTR psignw, 0, 0 -AVX_INSTR psignd, 0, 0 -AVX_INSTR psllw, 0, 0 -AVX_INSTR pslld, 0, 0 -AVX_INSTR psllq, 0, 0 -AVX_INSTR pslldq, 0, 0 -AVX_INSTR psraw, 0, 0 -AVX_INSTR psrad, 0, 0 -AVX_INSTR psrlw, 0, 0 -AVX_INSTR psrld, 0, 0 -AVX_INSTR psrlq, 0, 0 -AVX_INSTR psrldq, 0, 0 -AVX_INSTR psubb, 0, 0 -AVX_INSTR psubw, 0, 0 -AVX_INSTR psubd, 0, 0 -AVX_INSTR psubq, 0, 0 -AVX_INSTR psubsb, 0, 0 -AVX_INSTR psubsw, 0, 0 -AVX_INSTR psubusb, 0, 0 -AVX_INSTR psubusw, 0, 0 -AVX_INSTR punpckhbw, 0, 0 -AVX_INSTR punpckhwd, 0, 0 -AVX_INSTR punpckhdq, 0, 0 -AVX_INSTR punpckhqdq, 0, 0 
-AVX_INSTR punpcklbw, 0, 0 -AVX_INSTR punpcklwd, 0, 0 -AVX_INSTR punpckldq, 0, 0 -AVX_INSTR punpcklqdq, 0, 0 -AVX_INSTR pxor, 0, 0 -AVX_INSTR shufps, 0, 1 -AVX_INSTR subpd, 1, 0 -AVX_INSTR subps, 1, 0 -AVX_INSTR subsd, 1, 0 -AVX_INSTR subss, 1, 0 -AVX_INSTR unpckhpd, 1, 0 -AVX_INSTR unpckhps, 1, 0 -AVX_INSTR unpcklpd, 1, 0 -AVX_INSTR unpcklps, 1, 0 -AVX_INSTR xorpd, 1, 0 -AVX_INSTR xorps, 1, 0 +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 
0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0 -AVX_INSTR pfsub, 1, 0 -AVX_INSTR pfmul, 1, 0 +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 ; base-4 constants for shuffles %assign i 0 | ||
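The reworked DECLARE_REG/PROLOGUE machinery exposes up to 15 GPRs on x86_64 and records each argument's stack offset directly in DECLARE_REG, which is what lets callers such as sad-a.asm and quant-a.asm above drop raw r10/r11 references in favour of r7/r8 and r4/r5. The offsets themselves just encode the two 64-bit calling conventions; a small, purely illustrative derivation of the 40/48/56... (Win64) and 8/16/24... (SysV) numbers, under the usual ABI assumptions:

    #include <stdio.h>
    /* Illustration only: byte offsets (from rsp at function entry, before any
     * pushes) of the first few stack-passed arguments. Win64 passes 4 args in
     * registers and reserves a 32-byte shadow area; SysV AMD64 passes 6 in
     * registers and has no shadow area. */
    int main(void)
    {
        for (int n = 4; n <= 8; n++)   /* Win64: return address (8) + shadow (32) */
            printf("win64 arg%d at rsp+%d\n", n, 8 + 32 + (n - 4) * 8);
        for (int n = 6; n <= 8; n++)   /* SysV: just the return address (8) */
            printf("sysv  arg%d at rsp+%d\n", n, 8 + (n - 6) * 8);
        return 0;
    }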
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/x86util.asm
@@ -143,6 +143,17 @@ %endif %endmacro +%macro WIDEN_SXWD 2 + punpckhwd m%2, m%1 + psrad m%2, 16 +%if cpuflag(sse4) + pmovsxwd m%1, m%1 +%else + punpcklwd m%1, m%1 + psrad m%1, 16 +%endif +%endmacro + %macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src) %if cpuflag(ssse3) pabsw %1, %2 @@ -272,7 +283,7 @@ paddd %1, %2 %endmacro -%macro HADDW 2 +%macro HADDW 2 ; reg, tmp %if cpuflag(xop) && mmsize == 16 vphaddwq %1, %1 movhlps %2, %1 @@ -586,7 +597,10 @@ %endmacro %macro SUMSUB2_AB 4 -%ifnum %3 +%if cpuflag(xop) + pmacs%1%1 m%4, m%3, [p%1_m2], m%2 + pmacs%1%1 m%2, m%2, [p%1_2], m%3 +%elifnum %3 psub%1 m%4, m%2, m%3 psub%1 m%4, m%3 padd%1 m%2, m%2 @@ -600,22 +614,6 @@ %endif %endmacro -%macro SUMSUB2_BA 4 -%if avx_enabled - padd%1 m%4, m%2, m%3 - padd%1 m%4, m%3 - psub%1 m%3, m%2 - psub%1 m%3, m%2 - SWAP %2, %4 -%else - mova m%4, m%2 - padd%1 m%2, m%3 - padd%1 m%2, m%3 - psub%1 m%3, m%4 - psub%1 m%3, m%4 -%endif -%endmacro - %macro SUMSUBD2_AB 5 %ifnum %4 psra%1 m%5, m%2, 1 ; %3: %3>>1 @@ -697,7 +695,7 @@ %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr -%if cpuflag(ssse3) +%if BIT_DEPTH == 8 && cpuflag(ssse3) movh m%2, [%8+%1*FDEC_STRIDE] movh m%1, [%7+%1*FENC_STRIDE] punpcklbw m%1, m%2 @@ -715,10 +713,10 @@ pmaddubsw m%3, m%6 pmaddubsw m%4, m%6 %else - LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE] - LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE] - LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE] - LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE] + LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB] + LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB] + LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB] + LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB] %endif %endmacro @@ -767,13 +765,24 @@ packuswb %2, %1 %endmacro -%macro STORE_DIFF 4 +; (high depth) in: %1, %2, min to clip, max to clip, mem128 +; in: %1, tmp, %3, mem64 +%macro STORE_DIFF 4-5 +%ifdef HIGH_BIT_DEPTH + psrad %1, 6 + psrad %2, 6 + packssdw %1, %2 + paddw %1, %5 + CLIPW %1, %3, %4 + mova %5, %1 +%else movh %2, %4 punpcklbw %2, %3 psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1 +%endif %endmacro %macro SHUFFLE_MASK_W 8 @@ -783,3 +792,12 @@ %rotate 1 %endrep %endmacro + +; instruction, accum, input, iteration (zero to swap, nonzero to add) +%macro ACCUM 4 +%if %4 + %1 m%2, m%3 +%else + SWAP %2, %3 +%endif +%endmacro | ||
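Besides the ACCUM helper used throughout the new code and the WIDEN_SXWD widening macro, x86util.asm gains a HIGH_BIT_DEPTH path for STORE_DIFF that scales the 32-bit residual, adds the predicted pixels and clips to the valid sample range. A scalar sketch of that path (function name and prototype are illustrative):

    #include <stdint.h>
    /* Scalar sketch of the HIGH_BIT_DEPTH STORE_DIFF path above: psrad 6,
     * packssdw, paddw with the predicted pixels, CLIPW to [0, pixel_max]. */
    static void store_diff_hbd(uint16_t *dst, const int32_t *residual, int n, int pixel_max)
    {
        for (int i = 0; i < n; i++) {
            int v = (residual[i] >> 6) + dst[i];
            dst[i] = v < 0 ? 0 : v > pixel_max ? pixel_max : v;
        }
    }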
Changed: x264-snapshot-20120126-2245.tar.bz2/configure
@@ -212,6 +212,10 @@ rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest* +SRCPATH="$(cd $(dirname $0); pwd)" +[ "$SRCPATH" = "$(pwd)" ] && SRCPATH=. +[ -n "$(echo $SRCPATH | grep ' ')" ] && die "Out of tree builds are impossible with whitespace in source path." + prefix='/usr/local' exec_prefix='${prefix}' bindir='${exec_prefix}/bin' @@ -241,7 +245,7 @@ chroma_format="all" compiler="GNU" -CFLAGS="$CFLAGS -Wall -I." +CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" LDFLAGSCLI="$LDFLAGSCLI" ASFLAGS="$ASFLAGS" @@ -380,10 +384,10 @@ STRIP="${STRIP-${cross_prefix}strip}" if [ "x$host" = x ]; then - host=`./config.guess` + host=`${SRCPATH}/config.guess` fi # normalize a triplet into a quadruplet -host=`./config.sub $host` +host=`${SRCPATH}/config.sub $host` # split $host host_cpu="${host%%-*}" @@ -588,7 +592,7 @@ s390|s390x) ARCH="S390" ;; - parisc|parisc64) + hppa*|parisc*) ARCH="PARISC" ;; ia64) @@ -626,12 +630,12 @@ fi fi -if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" \) ] ; then +if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then pic="yes" fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then - if ! as_check "vfmaddps xmm0, xmm0, xmm0, xmm0" ; then + if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1` echo "Found $VER" echo "Minimum version is yasm-1.0.0" @@ -764,10 +768,10 @@ [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil" if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then - if cc_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "av_get_pix_fmt_name(0);" ; then + if cpp_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "defined(PIX_FMT_RGB)" ; then swscale="yes" else - echo "Warning: av_get_pix_fmt_name is missing from libavutil, update for swscale support" + echo "Warning: PIX_FMT_RGB is missing from libavutil, update for swscale support" fi fi fi @@ -795,7 +799,7 @@ fi if [ "$ffms" = "auto" ] ; then - ffms_major="2"; ffms_minor="14"; ffms_micro="0"; ffms_bump="0" + ffms_major="2"; ffms_minor="16"; ffms_micro="2"; ffms_bump="0" ffms="no" if ${cross_prefix}pkg-config --exists ffms2 2>/dev/null; then @@ -997,6 +1001,7 @@ # generate config files cat > config.mak << EOF +SRCPATH=$SRCPATH prefix=$prefix exec_prefix=$exec_prefix bindir=$bindir @@ -1036,7 +1041,7 @@ fi if [ "$shared" = "yes" ]; then - API=$(grep '#define X264_BUILD' < x264.h | cut -f 3 -d ' ') + API=$(grep '#define X264_BUILD' < ${SRCPATH}/x264.h | cut -f 3 -d ' ') if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then echo "SONAME=libx264-$API.dll" >> config.mak if [ $compiler = ICL ]; then @@ -1087,7 +1092,7 @@ echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak -./version.sh >> x264_config.h +${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h pclibs="-L$libdir -lx264 $libpthread" @@ -1139,6 +1144,9 @@ cat conftest.log rm conftest.log +[ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile +mkdir -p common/{arm,ppc,sparc,x86} encoder extras filters/video input output tools + echo echo "You can run 'make' or 'make fprofiled' now." | ||
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/analyse.c ^ |
@@ -433,8 +433,10 @@ a->i_satd_i4x4 = a->i_satd_chroma = COST_MAX; - /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */ - a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX; + /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it. + * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */ + uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8; + a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX; a->b_fast_intra = 0; a->b_avoid_topright = 0; | ||
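The added comment states the reason for the change: with a large enough lambda2 the PCM cost overflows the cost type, so the product is computed in 64 bits and saturated against COST_MAX before being stored. A standalone sketch of the same pattern (the COST_MAX value below is assumed, not taken from the patch):

    #include <stdint.h>
    #define COST_MAX (1<<28)   /* stand-in sentinel, value assumed */
    static int pcm_cost_capped( uint64_t pcm_cost, uint64_t lambda2 )
    {
        uint64_t cost = (pcm_cost * lambda2 + 128) >> 8;   /* same rounding as the hunk */
        return cost < COST_MAX ? (int)cost : COST_MAX;     /* saturate instead of wrapping */
    }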
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/cabac.c ^ |
@@ -167,7 +167,12 @@ if( i_dqp != 0 ) { - int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1); + /* Faster than (i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp-1)). + * If you so much as sneeze on these lines, gcc will compile this suboptimally. */ + i_dqp *= 2; + int val = 1 - i_dqp; + if( val < 0 ) val = i_dqp; + val--; /* dqp is interpreted modulo (QP_MAX_SPEC+1) */ if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 ) val = 2*QP_MAX_SPEC+1 - val; @@ -289,8 +294,8 @@ x264_cabac_encode_decision( cb, ctxbase + 5, 1 ); if( i_abs < 9 ) { - cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]]; - cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]]; + cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]]; + cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]]; } else { @@ -653,7 +658,12 @@ { 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 }; -static const uint8_t significant_coeff_flag_offset_8x8[2][63] = +#if RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63]; +extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; +extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +#else +const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -665,14 +675,15 @@ 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }}; -static const uint8_t last_coeff_flag_offset_8x8[63] = +const uint8_t x264_last_coeff_flag_offset_8x8[63] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; -static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +#endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). @@ -732,15 +743,15 @@ if( chroma422dc ) { int count_m1 = 7; - WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] ) + WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] ) } else { int count_m1 = count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { - const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; - WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] ) + const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; + WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] ) } else WRITE_SIGMAP( i, i ) @@ -794,7 +805,7 @@ * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { - const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; + const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; @@ -807,9 +818,9 @@ if( last != (b_8x8 ? 63 : chroma422dc ? 
7 : count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : - chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : - chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] : + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); } if( coeff_abs > 1 ) @@ -818,13 +829,13 @@ ctx = levelgt1_ctx[0] + ctx_level; if( coeff_abs < 15 ) { - cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { - cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][0]; @@ -842,9 +853,9 @@ { coeff_abs = abs(l[i]); x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : - chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : - chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 ); + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] : + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; if( coeff_abs > 1 ) @@ -853,13 +864,13 @@ ctx = levelgt1_ctx[node_ctx] + ctx_level; if( coeff_abs < 15 ) { - cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { - cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][node_ctx]; @@ -873,7 +884,7 @@ } else x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : - chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 ); + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); } } | ||
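The rewritten delta-QP mapping in this hunk is a branch-light equivalent of the original ternary expression for every nonzero dqp; a quick standalone check (not part of the patch) confirms the equivalence:

    #include <assert.h>
    int main( void )
    {
        for( int dqp = -51; dqp <= 51; dqp++ )
        {
            if( !dqp ) continue;                       /* the code only runs for dqp != 0 */
            int ref = dqp <= 0 ? -2*dqp : 2*dqp - 1;   /* original expression */
            int d   = dqp * 2;                         /* rewritten form from the hunk */
            int val = 1 - d;
            if( val < 0 ) val = d;
            val--;
            assert( val == ref );
        }
        return 0;
    }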
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/cavlc.c ^ |
@@ -132,6 +132,7 @@ runlevel.level[1] = 2; runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); + x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 @@ -188,12 +189,8 @@ else if( (uint8_t)i_total < count_cat[ctx_block_cat] ) bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); - for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ ) - { - int i_zl = X264_MIN( i_total_zero, 7 ); - bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] ); - i_total_zero -= runlevel.run[i]; - } + int zero_run_code = x264_run_before[runlevel.mask]; + bs_write( s, zero_run_code&0x1f, zero_run_code>>5 ); return i_total; } | ||
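The per-coefficient run_before loop is replaced by a single lookup in a combined table indexed by runlevel.mask (prefetched earlier to hide the latency). Judging from the bs_write() call, each entry packs the total code length in its low 5 bits and the concatenated code bits above them; a tiny check of that inferred layout with made-up values:

    #include <assert.h>
    #include <stdint.h>
    /* hypothetical packing helper matching the inferred layout */
    static uint32_t pack_vlc( uint32_t bits, int len ) { return (bits << 5) | (uint32_t)len; }
    int main( void )
    {
        uint32_t e = pack_vlc( 0x2b, 9 );   /* 9 code bits, values made up */
        assert( (e & 0x1f) == 9 );          /* what bs_write() receives as the bit count */
        assert( (e >> 5)   == 0x2b );       /* and as the bits themselves */
        return 0;
    }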
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/encoder.c ^ |
@@ -61,7 +61,11 @@ static double x264_ssim( double ssim ) { - return -10.0 * log10( 1 - ssim ); + double inv_ssim = 1 - ssim; + if( inv_ssim <= 0.0000000001 ) /* Max 100dB */ + return 100; + + return -10.0 * log10( inv_ssim ); } static void x264_frame_dump( x264_t *h ) @@ -472,7 +476,6 @@ if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; - h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); if( h->param.i_threads > 1 ) { #if !HAVE_THREAD @@ -487,7 +490,8 @@ h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); } } - else + h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); + if( h->param.i_threads == 1 ) h->param.b_sliced_threads = 0; h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads; if( h->i_thread_frames > 1 ) @@ -1169,10 +1173,6 @@ x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); - if( h->param.b_cabac ) - x264_cabac_init( h ); - else - x264_cavlc_init(); x264_pixel_init( h->param.cpu, &h->pixf ); x264_dct_init( h->param.cpu, &h->dctf ); x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); @@ -1181,7 +1181,10 @@ x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED ); x264_bitstream_init( h->param.cpu, &h->bsf ); - x264_dct_init_weights(); + if( h->param.b_cabac ) + x264_cabac_init( h ); + else + x264_cavlc_init( h ); mbcmp_init( h ); chroma_dsp_init( h ); @@ -3108,6 +3111,8 @@ if( pic_out->i_pts < pic_out->i_dts ) x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" ); + pic_out->opaque = h->fenc->opaque; + pic_out->img.i_csp = h->fdec->i_csp; #if HIGH_BIT_DEPTH pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH; | ||
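The new clamp keeps the SSIM-to-dB conversion finite as ssim approaches 1.0: -10*log10(1e-10) is exactly 100, so anything closer to a perfect score is reported as 100 dB instead of running off to infinity. A standalone restatement:

    #include <math.h>
    static double ssim_db( double ssim )
    {
        double inv = 1 - ssim;                                   /* inverse SSIM */
        return inv <= 0.0000000001 ? 100 : -10.0 * log10( inv ); /* cap at 100 dB */
    }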
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/macroblock.c ^ |
@@ -1090,7 +1090,7 @@ { int dct8x8 = cat&1; int size = dct8x8 ? 64 : 16; - const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; + const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) ) { | ||
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/ratecontrol.c ^ |
@@ -2385,7 +2385,7 @@ } } -void x264_threads_normalize_predictors( x264_t *h ) +static void x264_threads_normalize_predictors( x264_t *h ) { double totalsize = 0; for( int i = 0; i < h->param.i_threads; i++ ) | ||
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/rdo.c ^ |
@@ -32,8 +32,8 @@ /* Transition and size tables for abs<9 MVD and residual coding */ /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */ -static uint8_t cabac_transition_unary[15][128]; -static uint16_t cabac_size_unary[15][128]; +uint8_t x264_cabac_transition_unary[15][128]; +uint16_t x264_cabac_size_unary[15][128]; /* Transition and size tables for abs>9 MVD */ /* Consist of 5 1s and a bypass sign bit */ static uint8_t cabac_transition_5ones[128]; @@ -365,9 +365,9 @@ * Trellis RD quantization ****************************************************************************/ -#define TRELLIS_SCORE_MAX ((uint64_t)1<<50) +#define TRELLIS_SCORE_MAX -1LL // negative marks the node as invalid +#define TRELLIS_SCORE_BIAS 1LL<<60; // bias so that all valid scores are positive, even after negative contributions from psy #define CABAC_SIZE_BITS 8 -#define SSD_WEIGHT_BITS 5 #define LAMBDA_BITS 4 /* precalculate the cost of coding various combinations of bits in a single context */ @@ -386,8 +386,8 @@ f8_bits += x264_cabac_size_decision2( &ctx, 0 ); f8_bits += 1 << CABAC_SIZE_BITS; //sign - cabac_size_unary[i_prefix][i_ctx] = f8_bits; - cabac_transition_unary[i_prefix][i_ctx] = ctx; + x264_cabac_size_unary[i_prefix][i_ctx] = f8_bits; + x264_cabac_transition_unary[i_prefix][i_ctx] = ctx; } } for( int i_ctx = 0; i_ctx < 128; i_ctx++ ) @@ -406,11 +406,17 @@ typedef struct { - int64_t score; + uint64_t score; int level_idx; // index into level_tree[] - uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1 + uint8_t cabac_state[4]; // just contexts 0,4,8,9 of the 10 relevant to coding abs_level_m1 } trellis_node_t; +typedef struct +{ + uint16_t next; + uint16_t abs_level; +} trellis_level_t; + // TODO: // save cabac state between blocks? // use trellis' RD score instead of x264_mb_decimate_score? @@ -431,68 +437,307 @@ // comparable to the input. so unquant is the direct inverse of quant, // and uses the dct scaling factors, not the idct ones. +#define SIGN(x,y) ((x^(y >> 31))-(y >> 31)) + +#define SET_LEVEL(ndst, nsrc, l) {\ + if( sizeof(trellis_level_t) == sizeof(uint32_t) )\ + M32( &level_tree[levels_used] ) = pack16to32( nsrc.level_idx, l );\ + else\ + level_tree[levels_used] = (trellis_level_t){ nsrc.level_idx, l };\ + ndst.level_idx = levels_used;\ + levels_used++;\ +} + +// encode all values of the dc coef in a block which is known to have no ac +static NOINLINE +int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef_weight, int lambda2, uint8_t *cabac_state, int cost_sig ) +{ + uint64_t bscore = TRELLIS_SCORE_MAX; + int ret = 0; + int q = abs( quant_coef ); + for( int abs_level = q-1; abs_level <= q; abs_level++ ) + { + int unquant_abs_level = (unquant_mf * abs_level + 128) >> 8; + + /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. 
*/ + int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15); + uint64_t score = (uint64_t)d*d * coef_weight; + + /* code the proposed level, and count how much entropy it would take */ + if( abs_level ) + { + unsigned f8_bits = cost_sig; + int prefix = X264_MIN( abs_level - 1, 14 ); + f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 ); + f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]]; + if( abs_level >= 15 ) + f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS; + score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); + } + + COPY2_IF_LT( bscore, score, ret, abs_level ); + } + return SIGN(ret, sign_coef); +} + +// encode one value of one coef in one context +static ALWAYS_INLINE +int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost, + int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3], + trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state ) +{ + uint64_t score = nodes_prev[j].score + ssd; + /* code the proposed level, and count how much entropy it would take */ + unsigned f8_bits = cost_siglast[ j ? 1 : 2 ]; + uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx]; + f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)]; + uint8_t levelgt1_state; + if( const_level > 1 ) + { + levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx]; + f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost; + } + else + f8_bits += 1 << CABAC_SIZE_BITS; + score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); + + /* save the node if it's better than any existing node with the same cabac ctx */ + if( score < nodes_cur[node_ctx].score ) + { + nodes_cur[node_ctx].score = score; + if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state + M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12); + else if( j >= 3 ) + M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state); + if( j >= 3 ) // skip the transition if we're not going to reuse the context + nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1]; + if( const_level > 1 && node_ctx == 7 ) + nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state]; + nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx; + SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level ); + } + return levels_used; +} + +// encode one value of one coef in all contexts, templated by which value that is. +// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure. +// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning, +// so exploiting that would be more complicated. 
+static NOINLINE +int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used ) +{ + nodes_cur[0].score = nodes_prev[0].score + ssd0; + nodes_cur[0].level_idx = nodes_prev[0].level_idx; + for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ ) + { + nodes_cur[j].score = nodes_prev[j].score; + if( j >= 3 ) + M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state); + SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 ); + } + return levels_used; +} + +static NOINLINE +int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used ) +{ + for( int j = 1; j < 8; j++ ) + // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0. + if( (int64_t)nodes_prev[j].score >= 0 ) + { + nodes_cur[j].score = nodes_prev[j].score; + if( j >= 3 ) + M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state); + SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 ); + } + return levels_used; +} + +#define COEF(const_level, ctx_hi, j, ...)\ + if( !j || (int64_t)nodes_prev[j].score >= 0 )\ + levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\ + j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\ + level_tree, levels_used, lambda2, level_state );\ + else if( !ctx_hi )\ + return levels_used; + +static NOINLINE +int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], + trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used, int lambda2, + uint8_t *level_state ) +{ + int abs_level = 1, prefix = 1, suffix_cost = 0; + COEF( 1, 0, 0, 1, 1, 0 ); + COEF( 1, 0, 1, 2, 2, 0 ); + COEF( 1, 0, 2, 3, 3, 0 ); + COEF( 1, 0, 3, 3, 4, 0 ); + return levels_used; +} + +static NOINLINE +int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], + trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used, int lambda2, + uint8_t *level_state ) +{ + int abs_level = 1, prefix = 1, suffix_cost = 0; + COEF( 1, 1, 1, 2, 2, 0 ); + COEF( 1, 1, 2, 3, 3, 0 ); + COEF( 1, 1, 3, 3, 4, 0 ); + COEF( 1, 1, 4, 4, 0, 0 ); + COEF( 1, 1, 5, 5, 0, 0 ); + COEF( 1, 1, 6, 6, 0, 0 ); + COEF( 1, 1, 7, 7, 0, 0 ); + return levels_used; +} + +static NOINLINE +int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], + trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used, int lambda2, + uint8_t *level_state, int levelgt1_ctx ) +{ + int prefix = X264_MIN( abs_level-1, 14 ); + int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0; + COEF( 2, 0, 0, 4, 1, 5 ); + COEF( 2, 0, 1, 4, 2, 5 ); + COEF( 2, 0, 2, 4, 3, 5 ); + COEF( 2, 0, 3, 4, 4, 5 ); + return levels_used; +} + +static NOINLINE +int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], + trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, + trellis_level_t *level_tree, int levels_used, int lambda2, + uint8_t *level_state, int levelgt1_ctx ) +{ + int prefix = X264_MIN( abs_level-1, 14 ); + int suffix_cost = abs_level >= 15 ? 
bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0; + COEF( 2, 1, 1, 4, 2, 5 ); + COEF( 2, 1, 2, 4, 3, 5 ); + COEF( 2, 1, 3, 4, 4, 5 ); + COEF( 2, 1, 4, 5, 0, 6 ); + COEF( 2, 1, 5, 6, 0, 7 ); + COEF( 2, 1, 6, 7, 0, 8 ); + COEF( 2, 1, 7, 7, 0, levelgt1_ctx ); + return levels_used; +} + static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct, - const udctcoef *quant_mf, const int *unquant_mf, - const uint16_t *coef_weight, const uint8_t *zigzag, - int ctx_block_cat, int i_lambda2, int b_ac, - int b_chroma, int dc, int i_coefs, int idx ) + udctcoef *quant_mf, udctcoef *quant_bias, const int *unquant_mf, + const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, + int b_chroma, int dc, int num_coefs, int idx ) { - udctcoef abs_coefs[64]; - int8_t signs[64]; - trellis_node_t nodes[2][8]; - trellis_node_t *nodes_cur = nodes[0]; - trellis_node_t *nodes_prev = nodes[1]; - trellis_node_t *bnode; + ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] ); + const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; + const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; - const uint8_t *levelgt1_ctx = b_chroma && dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - const int f = 1 << 15; // no deadzone - int i_last_nnz; - int i; + int levelgt1_ctx = b_chroma && dc ? 8 : 9; - // (# of coefs) * (# of ctx) * (# of levels tried) = 1024 - // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough, - // but it takes more time to remove dead states than you gain in reduced memory. - struct + if( dc ) { - uint16_t abs_level; - uint16_t next; - } level_tree[64*8*2]; - int i_levels_used = 1; - - /* init coefs */ - for( i = i_coefs-1; i >= b_ac; i-- ) - if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f ) - break; - - if( i < b_ac ) + if( num_coefs == 16 ) + { + memcpy( orig_coefs, dct, sizeof(dctcoef)*16 ); + if( !h->quantf.quant_4x4_dc( dct, quant_mf[0] >> 1, quant_bias[0] << 1 ) ) + return 0; + h->zigzagf.scan_4x4( quant_coefs, dct ); + } + else + { + memcpy( orig_coefs, dct, sizeof(dctcoef)*num_coefs ); + int nz = h->quantf.quant_2x2_dc( &dct[0], quant_mf[0] >> 1, quant_bias[0] << 1 ); + if( num_coefs == 8 ) + nz |= h->quantf.quant_2x2_dc( &dct[4], quant_mf[0] >> 1, quant_bias[0] << 1 ); + if( !nz ) + return 0; + for( int i = 0; i < num_coefs; i++ ) + quant_coefs[i] = dct[zigzag[i]]; + } + } + else { - /* We only need to zero an empty 4x4 block. 8x8 can be - implicitly emptied via zero nnz, as can dc. */ - if( i_coefs == 16 && !dc ) - memset( dct, 0, 16 * sizeof(dctcoef) ); - return 0; + if( num_coefs == 64 ) + { + h->mc.memcpy_aligned( orig_coefs, dct, sizeof(dctcoef)*64 ); + if( !h->quantf.quant_8x8( dct, quant_mf, quant_bias ) ) + return 0; + h->zigzagf.scan_8x8( quant_coefs, dct ); + } + else //if( num_coefs == 16 ) + { + memcpy( orig_coefs, dct, sizeof(dctcoef)*16 ); + if( !h->quantf.quant_4x4( dct, quant_mf, quant_bias ) ) + return 0; + h->zigzagf.scan_4x4( quant_coefs, dct ); + } } - i_last_nnz = i; - idx &= i_coefs == 64 ? 
3 : 15; + int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; + uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ]; - for( ; i >= b_ac; i-- ) - { - int coef = dct[zigzag[i]]; - abs_coefs[i] = abs(coef); - signs[i] = coef>>31 | 1; - } + /* shortcut for dc-only blocks. + * this doesn't affect the output, but saves some unnecessary computation. */ + if( last_nnz == 0 && !dc ) + { + int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 ) + + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 ); + dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig ); + return !!dct[0]; + } + +#if HAVE_MMX && ARCH_X86_64 +#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\ + cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8) + if( num_coefs == 16 && !dc ) + if( b_chroma || !h->mb.i_psy_trellis ) + return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac ); + else + return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis ); + else if( num_coefs == 64 && !dc ) + if( b_chroma || !h->mb.i_psy_trellis ) + return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced ); + else + return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis); + else if( num_coefs == 8 && dc ) + return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS ); + else if( dc ) + return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 ); +#endif + // (# of coefs) * (# of ctx) * (# of levels tried) = 1024 + // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough, + // but it takes more time to remove dead states than you gain in reduced memory. + trellis_level_t level_tree[64*8*2]; + int levels_used = 1; /* init trellis */ - for( int j = 1; j < 8; j++ ) + trellis_node_t nodes[2][8]; + trellis_node_t *nodes_cur = nodes[0]; + trellis_node_t *nodes_prev = nodes[1]; + trellis_node_t *bnode; + for( int j = 1; j < 4; j++ ) nodes_cur[j].score = TRELLIS_SCORE_MAX; - nodes_cur[0].score = 0; + nodes_cur[0].score = TRELLIS_SCORE_BIAS; nodes_cur[0].level_idx = 0; level_tree[0].abs_level = 0; level_tree[0].next = 0; + ALIGNED_4( uint8_t level_state[16] ); + memcpy( level_state, cabac_state, 10 ); + level_state[12] = cabac_state[0]; // packed subset for copying into trellis_node_t + level_state[13] = cabac_state[4]; + level_state[14] = cabac_state[8]; + level_state[15] = cabac_state[9]; + + idx &= num_coefs == 64 ? 3 : 15; // coefs are processed in reverse order, because that's how the abs value is coded. // last_coef and significant_coef flags are normally coded in forward order, but @@ -501,160 +746,135 @@ // position, so the order doesn't matter, and we don't even have to update their contexts. // in 8x8 blocks, some positions share contexts, so we'll just have to hope that // cabac isn't too sensitive. - - memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ], 10 ); - - for( i = i_last_nnz; i >= b_ac; i-- ) - { - int i_coef = abs_coefs[i]; - int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16; - int cost_sig[2], cost_last[2]; - trellis_node_t n; - - // skip 0s: this doesn't affect the output, but saves some unnecessary computation. - if( q == 0 ) - { - // no need to calculate ssd of 0s: it's the same in all nodes. 
- // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s. - int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : - b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; - const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ) - * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); - for( int j = 1; j < 8; j++ ) - { - if( nodes_cur[j].score != TRELLIS_SCORE_MAX ) - { -#define SET_LEVEL(n,l) \ - level_tree[i_levels_used].abs_level = l; \ - level_tree[i_levels_used].next = n.level_idx; \ - n.level_idx = i_levels_used; \ - i_levels_used++; - - SET_LEVEL( nodes_cur[j], 0 ); - nodes_cur[j].score += cost_sig0; - } - } - continue; - } - - XCHG( trellis_node_t*, nodes_cur, nodes_prev ); - - for( int j = 0; j < 8; j++ ) - nodes_cur[j].score = TRELLIS_SCORE_MAX; - - if( i < i_coefs-1 ) - { - int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : - b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; - int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : - b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; - cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ); - cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 ); - cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ); - cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ); - } - else - { - cost_sig[0] = cost_sig[1] = 0; - cost_last[0] = cost_last[1] = 0; - } - - // there are a few cases where increasing the coeff magnitude helps, - // but it's only around .003 dB, and skipping them ~doubles the speed of trellis. - // could also try q-2: that sometimes helps, but also sometimes decimates blocks - // that are better left coded, especially at QP > 40. - for( int abs_level = q; abs_level >= q-1; abs_level-- ) - { - int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8); - int d = i_coef - unquant_abs_level; - int64_t ssd; - /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */ - if( h->mb.i_psy_trellis && i && !dc && !b_chroma ) - { - int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]]; - int predicted_coef = orig_coef - i_coef * signs[i]; - int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]); - int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]]; - ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value; - } - else - /* FIXME: for i16x16 dc is this weight optimal? 
*/ - ssd = (int64_t)d*d * (dc?256:coef_weight[i]); - - for( int j = 0; j < 8; j++ ) - { - int node_ctx = j; - if( nodes_prev[j].score == TRELLIS_SCORE_MAX ) - continue; - n = nodes_prev[j]; - - /* code the proposed level, and count how much entropy it would take */ - if( abs_level || node_ctx ) - { - unsigned f8_bits = cost_sig[ abs_level != 0 ]; - if( abs_level ) - { - const int i_prefix = X264_MIN( abs_level - 1, 14 ); - f8_bits += cost_last[ node_ctx == 0 ]; - f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 ); - if( i_prefix > 0 ) - { - uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]]; - f8_bits += cabac_size_unary[i_prefix][*ctx]; - *ctx = cabac_transition_unary[i_prefix][*ctx]; - if( abs_level >= 15 ) - f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS; - node_ctx = coeff_abs_level_transition[1][node_ctx]; - } - else - { - f8_bits += 1 << CABAC_SIZE_BITS; - node_ctx = coeff_abs_level_transition[0][node_ctx]; - } - } - n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); - } - - if( j || i || dc ) - n.score += ssd; - /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */ - else - { - d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15); - n.score += (int64_t)d*d * coef_weight[i]; - } - - /* save the node if it's better than any existing node with the same cabac ctx */ - if( n.score < nodes_cur[node_ctx].score ) - { - SET_LEVEL( n, abs_level ); - nodes_cur[node_ctx] = n; - } - } - } - } - - /* output levels from the best path through the trellis */ - bnode = &nodes_cur[0]; - for( int j = 1; j < 8; j++ ) - if( nodes_cur[j].score < bnode->score ) + int i = last_nnz; +#define TRELLIS_LOOP(ctx_hi)\ + for( ; i >= b_ac; i-- )\ + {\ + /* skip 0s: this doesn't affect the output, but saves some unnecessary computation. */\ + if( !quant_coefs[i] )\ + {\ + /* no need to calculate ssd of 0s: it's the same in all nodes.\ + * no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s. + * subtracting from one score is equivalent to adding to the rest. */\ + if( !ctx_hi )\ + {\ + int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\ + b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\ + uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\ + * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\ + nodes_cur[0].score -= cost_sig0;\ + }\ + for( int j = 1; j < (ctx_hi?8:4); j++ )\ + SET_LEVEL( nodes_cur[j], nodes_cur[j], 0 );\ + continue;\ + }\ +\ + int sign_coef = orig_coefs[zigzag[i]];\ + int abs_coef = abs( sign_coef );\ + int q = abs( quant_coefs[i] );\ + int cost_siglast[3]; /* { zero, nonzero, nonzero-and-last } */\ + XCHG( trellis_node_t*, nodes_cur, nodes_prev );\ + for( int j = ctx_hi; j < 8; j++ )\ + nodes_cur[j].score = TRELLIS_SCORE_MAX;\ +\ + if( i < num_coefs-1 || ctx_hi )\ + {\ + int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\ + b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\ + int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\ + b_chroma && dc && num_coefs == 8 ? 
x264_coeff_flag_offset_chroma_422_dc[i] : i;\ + cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\ + int cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\ + cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\ + if( !ctx_hi )\ + cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;\ + }\ + else\ + {\ + cost_siglast[0] = cost_siglast[1] = cost_siglast[2] = 0;\ + }\ +\ + /* there are a few cases where increasing the coeff magnitude helps,\ + * but it's only around .003 dB, and skipping them ~doubles the speed of trellis.\ + * could also try q-2: that sometimes helps, but also sometimes decimates blocks\ + * that are better left coded, especially at QP > 40. */\ + uint64_t ssd0[2], ssd1[2];\ + for( int k = 0; k < 2; k++ )\ + {\ + int abs_level = q-1+k;\ + int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);\ + int d = abs_coef - unquant_abs_level;\ + /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */\ + if( h->mb.i_psy_trellis && i && !dc && !b_chroma )\ + {\ + int orig_coef = (num_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];\ + int predicted_coef = orig_coef - sign_coef;\ + int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\ + int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\ + ssd1[k] = (uint64_t)d*d * coef_weight2[zigzag[i]] - psy_weight * psy_value;\ + }\ + else\ + /* FIXME: for i16x16 dc is this weight optimal? */\ + ssd1[k] = (uint64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\ + ssd0[k] = ssd1[k];\ + if( !i && !dc && !ctx_hi )\ + {\ + /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\ + d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\ + ssd0[k] = (uint64_t)d*d * coef_weight2[zigzag[i]];\ + }\ + }\ +\ + /* argument passing imposes some significant overhead here. gcc's interprocedural register allocation isn't up to it. 
*/\ + switch( q )\ + {\ + case 1:\ + ssd1[0] += (uint64_t)cost_siglast[0] * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\ + levels_used = trellis_coef0_##ctx_hi( ssd0[0]-ssd1[0], nodes_cur, nodes_prev, level_tree, levels_used );\ + levels_used = trellis_coef1_##ctx_hi( ssd0[1]-ssd1[0], ssd1[1]-ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\ + goto next##ctx_hi;\ + case 2:\ + levels_used = trellis_coef1_##ctx_hi( ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\ + levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ + goto next1;\ + default:\ + levels_used = trellis_coefn_##ctx_hi( q-1, ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ + levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ + goto next1;\ + }\ + next##ctx_hi:;\ + }\ + /* output levels from the best path through the trellis */\ + bnode = &nodes_cur[ctx_hi];\ + for( int j = ctx_hi+1; j < (ctx_hi?8:4); j++ )\ + if( nodes_cur[j].score < bnode->score )\ bnode = &nodes_cur[j]; + // keep 2 versions of the main quantization loop, depending on which subsets of the node_ctxs are live + // node_ctx 0..3, i.e. having not yet encountered any coefs that might be quantized to >1 + TRELLIS_LOOP(0); + if( bnode == &nodes_cur[0] ) { - if( i_coefs == 16 && !dc ) + /* We only need to zero an empty 4x4 block. 8x8 can be + implicitly emptied via zero nnz, as can dc. */ + if( num_coefs == 16 && !dc ) memset( dct, 0, 16 * sizeof(dctcoef) ); return 0; } + if(0) // accessible only by goto, not fallthrough + { + // node_ctx 1..7 (ctx0 ruled out because we never try both level0 and level2+ on the same coef) + TRELLIS_LOOP(1); + } + int level = bnode->level_idx; - for( i = b_ac; level; i++ ) + for( i = b_ac; i <= last_nnz; i++ ) { - dct[zigzag[i]] = level_tree[level].abs_level * signs[i]; + dct[zigzag[i]] = SIGN(level_tree[level].abs_level, dct[zigzag[i]]); level = level_tree[level].next; } - for( ; i < i_coefs; i++ ) - dct[zigzag[i]] = 0; return 1; } @@ -685,24 +905,25 @@ static ALWAYS_INLINE int quant_trellis_cavlc( x264_t *h, dctcoef *dct, const udctcoef *quant_mf, const int *unquant_mf, - const uint16_t *coef_weight, const uint8_t *zigzag, - int ctx_block_cat, int i_lambda2, int b_ac, - int b_chroma, int dc, int i_coefs, int idx, int b_8x8 ) + const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, + int b_chroma, int dc, int num_coefs, int idx, int b_8x8 ) { ALIGNED_16( dctcoef quant_coefs[2][16] ); ALIGNED_16( dctcoef coefs[16] ) = {0}; + const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab; + const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; int delta_distortion[16]; int64_t score = 1ULL<<62; int i, j; const int f = 1<<15; - int nC = b_chroma && dc ? 3 + (i_coefs>>2) + int nC = b_chroma && dc ? 3 + (num_coefs>>2) : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )]; /* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different * step/start/end than internal processing. 
*/ int step = 1; int start = b_ac; - int end = i_coefs - 1; + int end = num_coefs - 1; if( b_8x8 ) { start = idx&3; @@ -711,7 +932,7 @@ } idx &= 15; - i_lambda2 <<= LAMBDA_BITS; + lambda2 <<= LAMBDA_BITS; /* Find last non-zero coefficient. */ for( i = end; i >= start; i -= step ) @@ -726,10 +947,10 @@ * * We only search two roundings (nearest and nearest-1) like in CABAC trellis, * so we just store the difference in distortion between them. */ - int i_last_nnz = b_8x8 ? i >> 2 : i; + int last_nnz = b_8x8 ? i >> 2 : i; int coef_mask = 0; int round_mask = 0; - for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step ) + for( i = b_ac, j = start; i <= last_nnz; i++, j += step ) { int coef = dct[zigzag[j]]; int abs_coef = abs(coef); @@ -748,14 +969,14 @@ int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8); int d1 = abs_coef - unquant1; int d0 = abs_coef - unquant0; - delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]); + delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]); /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */ if( h->mb.i_psy_trellis && j && !dc && !b_chroma ) { int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]]; int predicted_coef = orig_coef - coef; - int psy_weight = b_8x8 ? x264_dct8_weight_tab[zigzag[j]] : x264_dct4_weight_tab[zigzag[j]]; + int psy_weight = coef_weight1[zigzag[j]]; int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign); int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign); delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight; @@ -778,7 +999,7 @@ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] ); else x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC ); - score = (int64_t)h->out.bs.i_bits_encoded * i_lambda2; + score = (int64_t)h->out.bs.i_bits_encoded * lambda2; /* QNS loop: pick the change that improves RD the most, apply it, repeat. 
* coef_mask and round_mask are used to simplify tracking of nonzeroness @@ -790,7 +1011,7 @@ int iter_coef = -1; int iter_mask = coef_mask; int iter_round = round_mask; - for( i = b_ac; i <= i_last_nnz; i++ ) + for( i = b_ac; i <= last_nnz; i++ ) { if( !delta_distortion[i] ) continue; @@ -811,7 +1032,7 @@ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] ); else x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC ); - cur_score += (int64_t)h->out.bs.i_bits_encoded * i_lambda2; + cur_score += (int64_t)h->out.bs.i_bits_encoded * lambda2; coefs[i] = old_coef; if( cur_score < iter_score ) @@ -839,10 +1060,8 @@ if( coef_mask ) { - for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step ) + for( i = b_ac, j = start; i < num_coefs; i++, j += step ) dct[zigzag[j]] = coefs[i]; - for( ; j <= end; j += step ) - dct[zigzag[j]] = 0; return 1; } @@ -862,11 +1081,12 @@ { if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp], + h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx ); return quant_trellis_cavlc( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 ); } @@ -892,11 +1112,12 @@ if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, - h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + h->quant4_mf[quant_cat][i_qp], h->quant4_bias0[quant_cat][i_qp], + h->unquant4_mf[quant_cat][i_qp], zigzag, DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx ); return quant_trellis_cavlc( h, dct, - h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], zigzag, DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 ); } @@ -907,14 +1128,12 @@ int b_ac = ctx_ac[ctx_block_cat]; if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - x264_dct4_weight2_zigzag[MB_INTERLACED], - x264_zigzag_scan4[MB_INTERLACED], + h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp], + h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx ); return quant_trellis_cavlc( h, dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - x264_dct4_weight2_zigzag[MB_INTERLACED], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 ); } @@ -925,9 +1144,8 @@ if( h->param.b_cabac ) { return quant_trellis_cabac( h, dct, - h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], - x264_dct8_weight2_zigzag[MB_INTERLACED], - x264_zigzag_scan8[MB_INTERLACED], + h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias0[i_quant_cat][i_qp], + h->unquant8_mf[i_quant_cat][i_qp], x264_zigzag_scan8[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx ); } @@ -937,7 +1155,6 @@ { int nz = quant_trellis_cavlc( h, dct, h->quant8_mf[i_quant_cat][i_qp], 
h->unquant8_mf[i_quant_cat][i_qp], - x264_dct8_weight2_zigzag[MB_INTERLACED], x264_zigzag_scan8[MB_INTERLACED], DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 ); /* Set up nonzero count for future calls */ | ||
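Two details of the rewritten trellis worth noting for review: TRELLIS_SCORE_MAX is now -1LL so that, per the comment, a node whose score reads as negative when viewed signed is invalid, while TRELLIS_SCORE_BIAS keeps all valid scores positive even after negative psy contributions; and the new SIGN(x,y) helper copies the sign of y onto a non-negative magnitude x with an xor/subtract pair (relying on arithmetic right shift of signed ints, as the rest of x264 already does). A standalone check of the helper:

    #include <assert.h>
    #define SIGN(x,y) ((x^(y >> 31))-(y >> 31))   /* as defined in the hunk */
    int main( void )
    {
        assert( SIGN(5,  7) ==  5 );   /* y >= 0: x is returned unchanged */
        assert( SIGN(5, -7) == -5 );   /* y < 0:  ~x + 1 == -x */
        assert( SIGN(0, -1) ==  0 );
        return 0;
    }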
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/slicetype.c ^ |
@@ -283,7 +283,7 @@ return cost; } -void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) +static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ | ||
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/filters/video/resize.c ^ |
@@ -32,9 +32,10 @@ static int full_check( video_info_t *info, x264_param_t *param ) { int required = 0; - required |= info->csp != param->i_csp; - required |= info->width != param->i_width; - required |= info->height != param->i_height; + required |= info->csp != param->i_csp; + required |= info->width != param->i_width; + required |= info->height != param->i_height; + required |= info->fullrange != param->vui.b_fullrange; return required; } @@ -44,11 +45,16 @@ #include <libavutil/opt.h> #include <libavutil/pixdesc.h> +#ifndef PIX_FMT_BGRA64 +#define PIX_FMT_BGRA64 PIX_FMT_NONE +#endif + typedef struct { int width; int height; int pix_fmt; + int range; } frame_prop_t; typedef struct @@ -59,6 +65,7 @@ cli_pic_t buffer; int buffer_allocated; int dst_csp; + int input_range; struct SwsContext *ctx; uint32_t ctx_flags; /* state of swapping chroma planes pre and post resize */ @@ -142,62 +149,63 @@ case X264_CSP_YV24: /* specially handled via swapping chroma */ case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P; case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_RGB48 : PIX_FMT_RGB24; - /* the next 3 csps have no equivalent 16bit depth in swscale */ + case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_BGR48 : PIX_FMT_BGR24; + case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_BGRA64 : PIX_FMT_BGRA; + /* the next csp has no equivalent 16bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_NV12; - case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_BGR24; - case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_BGRA; default: return PIX_FMT_NONE; } } +static int pix_number_of_planes( const AVPixFmtDescriptor *pix_desc ) +{ + int num_planes = 0; + for( int i = 0; i < pix_desc->nb_components; i++ ) + { + int plane_plus1 = pix_desc->comp[i].plane + 1; + num_planes = X264_MAX( plane_plus1, num_planes ); + } + return num_planes; +} + static int pick_closest_supported_csp( int csp ) { int pix_fmt = convert_csp_to_pix_fmt( csp ); - switch( pix_fmt ) + // first determine the base csp + int ret = X264_CSP_NONE; + const AVPixFmtDescriptor *pix_desc = av_pix_fmt_descriptors+pix_fmt; + if( (unsigned)pix_fmt >= PIX_FMT_NB || !pix_desc->name ) + return ret; + + const char *pix_fmt_name = pix_desc->name; + int is_rgb = pix_desc->flags & (PIX_FMT_RGB | PIX_FMT_PAL); + int is_bgr = !!strstr( pix_fmt_name, "bgr" ); + if( is_bgr || is_rgb ) + { + if( pix_desc->nb_components == 4 ) // has alpha + ret = X264_CSP_BGRA; + else if( is_bgr ) + ret = X264_CSP_BGR; + else + ret = X264_CSP_RGB; + } + else { - case PIX_FMT_YUV420P16LE: - case PIX_FMT_YUV420P16BE: - return X264_CSP_I420 | X264_CSP_HIGH_DEPTH; - case PIX_FMT_YUV422P: - case PIX_FMT_YUYV422: - case PIX_FMT_UYVY422: - case PIX_FMT_YUVJ422P: - return X264_CSP_I422; - case PIX_FMT_YUV422P16LE: - case PIX_FMT_YUV422P16BE: - return X264_CSP_I422 | X264_CSP_HIGH_DEPTH; - case PIX_FMT_YUV444P: - case PIX_FMT_YUVJ444P: - return X264_CSP_I444; - case PIX_FMT_YUV444P16LE: - case PIX_FMT_YUV444P16BE: - return X264_CSP_I444 | X264_CSP_HIGH_DEPTH; - case PIX_FMT_RGB24: - case PIX_FMT_RGB565BE: - case PIX_FMT_RGB565LE: - case PIX_FMT_RGB555BE: - case PIX_FMT_RGB555LE: - return X264_CSP_RGB; - case PIX_FMT_RGB48BE: - case PIX_FMT_RGB48LE: - return X264_CSP_RGB | X264_CSP_HIGH_DEPTH; - case PIX_FMT_BGR24: - case PIX_FMT_BGR565BE: - case PIX_FMT_BGR565LE: - case PIX_FMT_BGR555BE: - case PIX_FMT_BGR555LE: - return 
X264_CSP_BGR; - case PIX_FMT_ARGB: - case PIX_FMT_RGBA: - case PIX_FMT_ABGR: - case PIX_FMT_BGRA: - return X264_CSP_BGRA; - case PIX_FMT_NV12: - case PIX_FMT_NV21: - return X264_CSP_NV12; - default: - return X264_CSP_I420; + // yuv-based + if( pix_desc->nb_components == 1 || pix_desc->nb_components == 2 ) // no chroma + ret = X264_CSP_I420; + else if( pix_desc->log2_chroma_w && pix_desc->log2_chroma_h ) // reduced chroma width & height + ret = (pix_desc->nb_components == pix_number_of_planes( pix_desc )) ? X264_CSP_I420 : X264_CSP_NV12; + else if( pix_desc->log2_chroma_w ) // reduced chroma width only + ret = (pix_desc->nb_components == pix_number_of_planes( pix_desc )) ? X264_CSP_I422 : X264_CSP_NV16; + else + ret = X264_CSP_I444; } + // now determine high depth + for( int i = 0; i < pix_desc->nb_components; i++ ) + if( pix_desc->comp[i].depth_minus1 >= 8 ) + ret |= X264_CSP_HIGH_DEPTH; + return ret; } static int handle_opts( const char **optlist, char **opts, video_info_t *info, resizer_hnd_t *h ) @@ -343,57 +351,29 @@ return 0; } -static int handle_jpeg( int *format ) -{ - switch( *format ) - { - case PIX_FMT_YUVJ420P: - *format = PIX_FMT_YUV420P; - return 1; - case PIX_FMT_YUVJ422P: - *format = PIX_FMT_YUV422P; - return 1; - case PIX_FMT_YUVJ444P: - *format = PIX_FMT_YUV444P; - return 1; - case PIX_FMT_YUVJ440P: - *format = PIX_FMT_YUV440P; - return 1; - default: - return 0; - } -} - static int x264_init_sws_context( resizer_hnd_t *h ) { + if( h->ctx ) + sws_freeContext( h->ctx ); + h->ctx = sws_alloc_context(); if( !h->ctx ) - { - h->ctx = sws_alloc_context(); - if( !h->ctx ) - return -1; + return -1; - /* set flags that will not change */ - int dst_format = h->dst.pix_fmt; - int dst_range = handle_jpeg( &dst_format ); - av_set_int( h->ctx, "sws_flags", h->ctx_flags ); - av_set_int( h->ctx, "dstw", h->dst.width ); - av_set_int( h->ctx, "dsth", h->dst.height ); - av_set_int( h->ctx, "dst_format", dst_format ); - av_set_int( h->ctx, "dst_range", dst_range ); /* FIXME: use the correct full range value */ - } - - int src_format = h->scale.pix_fmt; - int src_range = handle_jpeg( &src_format ); - av_set_int( h->ctx, "srcw", h->scale.width ); - av_set_int( h->ctx, "srch", h->scale.height ); - av_set_int( h->ctx, "src_format", src_format ); - av_set_int( h->ctx, "src_range", src_range ); /* FIXME: use the correct full range value */ + av_opt_set_int( h->ctx, "sws_flags", h->ctx_flags, 0 ); + av_opt_set_int( h->ctx, "dstw", h->dst.width, 0 ); + av_opt_set_int( h->ctx, "dsth", h->dst.height, 0 ); + av_opt_set_int( h->ctx, "dst_format", h->dst.pix_fmt, 0 ); + av_opt_set_int( h->ctx, "dst_range", h->dst.range, 0 ); + + av_opt_set_int( h->ctx, "srcw", h->scale.width, 0 ); + av_opt_set_int( h->ctx, "srch", h->scale.height, 0 ); + av_opt_set_int( h->ctx, "src_format", h->scale.pix_fmt, 0 ); + av_opt_set_int( h->ctx, "src_range", h->scale.range, 0 ); - /* FIXME: use the correct full range values - * FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */ + /* FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */ sws_setColorspaceDetails( h->ctx, - sws_getCoefficients( SWS_CS_DEFAULT ), src_range, - sws_getCoefficients( SWS_CS_DEFAULT ), av_get_int( h->ctx, "dst_range", NULL ), + sws_getCoefficients( SWS_CS_DEFAULT ), h->scale.range, + sws_getCoefficients( SWS_CS_DEFAULT ), h->dst.range, 0, 1<<16, 1<<16 ); return sws_init_context( h->ctx, NULL, NULL ) < 0; @@ -401,7 +381,7 @@ static int check_resizer( resizer_hnd_t *h, cli_pic_t *in 
) { - frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ) }; + frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ), h->input_range }; if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) ) return 0; /* also warn if the resizer was initialized after the first frame */ @@ -440,16 +420,14 @@ h->dst_csp = info->csp; h->dst.width = info->width; h->dst.height = info->height; + h->dst.range = info->fullrange; // maintain input range if( !strcmp( opt_string, "normcsp" ) ) { /* only in normalization scenarios is the input capable of changing properties */ h->variable_input = 1; h->dst_csp = pick_closest_supported_csp( info->csp ); - /* now fix the catch-all i420 choice if it does not allow for the current input resolution dimensions. */ - if( h->dst_csp == X264_CSP_I420 && info->width&1 ) - h->dst_csp = X264_CSP_I444; - if( h->dst_csp == X264_CSP_I420 && info->height&1 ) - h->dst_csp = X264_CSP_I422; + FAIL_IF_ERROR( h->dst_csp == X264_CSP_NONE, + "filter get invalid input pixel format %d (colorspace %d)\n", convert_csp_to_pix_fmt( info->csp ), info->csp ) } else if( handle_opts( optlist, opts, info, h ) ) return -1; @@ -459,6 +437,7 @@ h->dst_csp = param->i_csp; h->dst.width = param->i_width; h->dst.height = param->i_height; + h->dst.range = param->vui.b_fullrange; // change to libx264's range } h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) ); x264_free_string_array( opts ); @@ -467,6 +446,7 @@ h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND; h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp ); h->scale = h->dst; + h->input_range = info->fullrange; /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */ int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER); @@ -500,6 +480,9 @@ if( h->dst.pix_fmt != src_pix_fmt ) x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n", av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) ); + else if( h->dst.range != h->input_range ) + x264_cli_log( NAME, X264_LOG_WARNING, "converting range from %s to %s\n", + h->input_range ? "PC" : "TV", h->dst.range ? "PC" : "TV" ); h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip /* if the input is not variable, initialize the context */ @@ -511,9 +494,10 @@ } /* finished initing, overwrite values */ - info->csp = h->dst_csp; - info->width = h->dst.width; - info->height = h->dst.height; + info->csp = h->dst_csp; + info->width = h->dst.width; + info->height = h->dst.height; + info->fullrange = h->dst.range; h->prev_filter = *filter; h->prev_hnd = *handle; | ||
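pick_closest_supported_csp() now derives the target colorspace from libavutil's pixel format descriptors instead of an explicit switch over formats. Two worked examples of the mapping; the descriptor values quoted here are assumptions about the libavutil of this snapshot's era, not something stated in the patch:

    /*  PIX_FMT_YUV420P10LE: 3 components on 3 planes,
     *      log2_chroma_w == log2_chroma_h == 1, comp[].depth_minus1 == 9
     *      -> X264_CSP_I420 | X264_CSP_HIGH_DEPTH
     *  PIX_FMT_NV12: 3 components on only 2 planes, chroma subsampled both ways
     *      -> X264_CSP_NV12 (8-bit, so no X264_CSP_HIGH_DEPTH)
     */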
[+] | Changed | x264-snapshot-20120126-2245.tar.bz2/input/avs.c ^ |
@@ -235,14 +235,40 @@ "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height ) FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1), "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height ) - const char *arg_name[2] = { NULL, "interlaced" }; - AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) }; char conv_func[14] = { "ConvertTo" }; strcat( conv_func, csp ); - AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, 2 ), arg_name ); + char matrix[7] = ""; + int arg_count = 2; + /* if doing a rgb <-> yuv conversion then range is handled via 'matrix'. though it's only supported in 2.56+ */ + if( avs_version >= 2.56f && ((opt->output_csp == X264_CSP_RGB && avs_is_yuv( vi )) || (opt->output_csp != X264_CSP_RGB && avs_is_rgb( vi ))) ) + { + // if converting from yuv, then we specify the matrix for the input, otherwise use the output's. + int use_pc_matrix = avs_is_yuv( vi ) ? opt->input_range == RANGE_PC : opt->output_range == RANGE_PC; + strcpy( matrix, use_pc_matrix ? "PC." : "Rec" ); + strcat( matrix, "601" ); /* FIXME: use correct coefficients */ + arg_count++; + // notification that the input range has changed to the desired one + opt->input_range = opt->output_range; + } + const char *arg_name[] = { NULL, "interlaced", "matrix" }; + AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) }; + AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name ); FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp ) res = update_clip( h, &vi, res2, res ); } + /* if swscale is not available, change the range if necessary. This only applies to YUV-based CSPs however */ + if( avs_is_yuv( vi ) && opt->output_range != RANGE_AUTO && ((opt->input_range == RANGE_PC) != opt->output_range) ) + { + const char *levels = opt->output_range ? "TV->PC" : "PC->TV"; + x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels ); + AVS_Value arg_arr[] = { res, avs_new_value_string( levels ) }; + const char *arg_name[] = { NULL, "levels" }; + AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name ); + FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) ) + res = update_clip( h, &vi, res2, res ); + // notification that the input range has changed to the desired one + opt->input_range = opt->output_range; + } #endif h->func.avs_release_value( res ); | ||
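For RGB<->YUV conversions the range is folded into AviSynth's matrix argument, and for pure range changes on YUV input the hunk falls back to ColorYUV. The strings it builds map as follows; the interpretation of the AviSynth names below is my assumption, not spelled out in the patch:

    /*  ConvertToXXX matrix (AviSynth 2.56+):
     *      "PC.601"  - full-range BT.601 coefficients
     *      "Rec601"  - limited (TV) range BT.601 coefficients
     *  ColorYUV levels when only the range changes:
     *      "TV->PC"  - expand 16-235 to 0-255
     *      "PC->TV"  - compress 0-255 to 16-235
     */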
Changed: x264-snapshot-20120126-2245.tar.bz2/input/ffms.c
@@ -65,6 +65,18 @@
     return 0;
 }
 
+/* handle the deprecated jpeg pixel formats */
+static int handle_jpeg( int csp, int *fullrange )
+{
+    switch( csp )
+    {
+        case PIX_FMT_YUVJ420P: *fullrange = 1; return PIX_FMT_YUV420P;
+        case PIX_FMT_YUVJ422P: *fullrange = 1; return PIX_FMT_YUV422P;
+        case PIX_FMT_YUVJ444P: *fullrange = 1; return PIX_FMT_YUV444P;
+        default: return csp;
+    }
+}
+
 static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
 {
     ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
@@ -119,11 +131,13 @@
     const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
     FAIL_IF_ERROR( !frame, "could not read frame 0\n" )
 
+    info->fullrange  = 0;
     info->width      = frame->EncodedWidth;
     info->height     = frame->EncodedHeight;
-    info->csp        = frame->EncodedPixelFormat | X264_CSP_OTHER;
+    info->csp        = handle_jpeg( frame->EncodedPixelFormat, &info->fullrange ) | X264_CSP_OTHER;
     info->interlaced = frame->InterlacedFrame;
     info->tff        = frame->TopFieldFirst;
+    info->fullrange |= frame->ColorRange == FFMS_CR_JPEG;
 
     /* ffms timestamps are in milliseconds. ffms also uses int64_ts for timebase,
      * so we need to reduce large timebases to prevent overflow */
Changed: x264-snapshot-20120126-2245.tar.bz2/input/input.h
@@ -42,6 +42,8 @@
     int seek;
     int progress;
     int output_csp; /* convert to this csp, if applicable */
+    int output_range; /* user desired output range */
+    int input_range; /* user override input range */
 } cli_input_opt_t;
 
 /* properties of the source given by the demuxer */
@@ -50,6 +52,8 @@
     int csp; /* colorspace of the input */
     uint32_t fps_num;
     uint32_t fps_den;
+    int fullrange; /* has 2^bit_depth-1 instead of 219*2^(bit_depth-8) ranges (YUV only) */
+    int width;
     int height;
     int interlaced;
     int num_frames;
@@ -60,7 +64,6 @@
     uint32_t timebase_num;
     uint32_t timebase_den;
     int vfr;
-    int width;
 } video_info_t;
 
 /* image data type used by x264cli */
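
The one-line comment on the new fullrange field packs in the whole definition: for bit depth b the nominal luma ranges it distinguishes are [16*2^(b-8), 235*2^(b-8)] (limited/TV, a span of 219*2^(b-8)) versus [0, 2^b-1] (full/PC). A tiny standalone program, not part of x264, that prints those bounds:

    #include <stdio.h>

    int main( void )
    {
        for( int bits = 8; bits <= 10; bits += 2 )
        {
            int tv_lo = 16  << (bits - 8);
            int tv_hi = 235 << (bits - 8);
            int pc_hi = (1 << bits) - 1;
            printf( "%2d-bit: limited (TV) luma %d..%d, full (PC) luma 0..%d\n",
                    bits, tv_lo, tv_hi, pc_hi );
        }
        return 0;  /* prints 16..235 / 0..255 for 8-bit, 64..940 / 0..1023 for 10-bit */
    }
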
Changed: x264-snapshot-20120126-2245.tar.bz2/input/lavf.c
@@ -46,6 +46,18 @@
     av_init_packet( pkt );\
 }
 
+/* handle the deprecated jpeg pixel formats */
+static int handle_jpeg( int csp, int *fullrange )
+{
+    switch( csp )
+    {
+        case PIX_FMT_YUVJ420P: *fullrange = 1; return PIX_FMT_YUV420P;
+        case PIX_FMT_YUVJ422P: *fullrange = 1; return PIX_FMT_YUV422P;
+        case PIX_FMT_YUVJ444P: *fullrange = 1; return PIX_FMT_YUV444P;
+        default: return csp;
+    }
+}
+
 static int read_frame_internal( cli_pic_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
 {
     if( h->first_pic && !info )
@@ -101,14 +113,16 @@
     memcpy( p_pic->img.stride, frame.linesize, sizeof(p_pic->img.stride) );
     memcpy( p_pic->img.plane, frame.data, sizeof(p_pic->img.plane) );
 
-    p_pic->img.height = c->height;
-    p_pic->img.csp = c->pix_fmt | X264_CSP_OTHER;
+    int is_fullrange  = 0;
     p_pic->img.width  = c->width;
+    p_pic->img.height = c->height;
+    p_pic->img.csp    = handle_jpeg( c->pix_fmt, &is_fullrange ) | X264_CSP_OTHER;
 
     if( info )
     {
+        info->fullrange  = is_fullrange;
         info->interlaced = frame.interlaced_frame;
-        info->tff = frame.top_field_first;
+        info->tff        = frame.top_field_first;
     }
 
     if( h->vfr_input )
@@ -186,6 +200,7 @@
     info->num_frames = h->lavf->streams[i]->nb_frames;
     info->sar_height = c->sample_aspect_ratio.den;
     info->sar_width  = c->sample_aspect_ratio.num;
+    info->fullrange |= c->color_range == AVCOL_RANGE_JPEG;
 
     /* avisynth stores rgb data vertically flipped. */
     if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
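
Both the ffms.c and lavf.c hunks derive fullrange from two independent hints and OR them together: a deprecated "J" (JPEG) pixel format that handle_jpeg() folds back to its normal-range twin, and the stream/container colour-range flag (AVCOL_RANGE_JPEG, FFMS_CR_JPEG). A self-contained sketch of that precedence, using stand-in constants rather than the real libavutil/FFMS2 enums:

    #include <stdio.h>

    /* stand-in constants for illustration only */
    enum { FMT_YUV420P, FMT_YUVJ420P };
    enum { COLOR_RANGE_UNSPECIFIED, COLOR_RANGE_MPEG, COLOR_RANGE_JPEG };

    /* mirror of the handle_jpeg() idea: fold the deprecated full-range pixel
     * format into its normal-range twin and remember that it was full range */
    static int normalize_pix_fmt( int fmt, int *fullrange )
    {
        if( fmt == FMT_YUVJ420P ) { *fullrange = 1; return FMT_YUV420P; }
        return fmt;
    }

    int main( void )
    {
        int fullrange = 0;
        int fmt = normalize_pix_fmt( FMT_YUVJ420P, &fullrange );      /* hint 1: "J" pix_fmt */
        fullrange |= (COLOR_RANGE_UNSPECIFIED == COLOR_RANGE_JPEG);   /* hint 2: range flag  */
        printf( "fmt=%d fullrange=%d\n", fmt, fullrange );            /* fmt=0 fullrange=1   */
        return 0;
    }
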
Changed: x264-snapshot-20120126-2245.tar.bz2/input/timecode.c
@@ -368,8 +368,6 @@
     timecode_input.picture_alloc = h->input.picture_alloc;
     timecode_input.picture_clean = h->input.picture_clean;
 
-    *p_handle = h;
-
     tcfile_in = fopen( psz_filename, "rb" );
     FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename )
     else if( !x264_is_regular_file( tcfile_in ) )
@@ -392,6 +390,7 @@
     info->timebase_den = h->timebase_den;
     info->vfr = 1;
 
+    *p_handle = h;
     return 0;
 }
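
The timecode.c change is not about ranges at all: it delays publishing the demuxer handle until initialization has finished, so a caller never ends up holding a half-initialized handle when opening or parsing the timecode file fails. The general pattern, as a sketch with illustrative fields and error handling (not x264's actual timecode demuxer):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { FILE *tcfile; /* ...parsed timebase, etc... */ } tc_handle_t;

    static int tc_open( const char *path, void **p_handle )
    {
        tc_handle_t *h = calloc( 1, sizeof(*h) );
        if( !h )
            return -1;
        h->tcfile = fopen( path, "rb" );
        if( !h->tcfile )
        {
            free( h );   /* clean up locally... */
            return -1;   /* ...and never expose the partial handle */
        }
        /* ... parse the file, fill in the timebase fields ... */
        *p_handle = h;   /* publish only after everything succeeded */
        return 0;
    }
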
Changed: x264-snapshot-20120126-2245.tar.bz2/tools/checkasm-a.asm
@@ -32,8 +32,6 @@ %ifdef WIN64 ; just random numbers to reduce the chance of incidental match ALIGN 16 -n4: dq 0xa77809bf11b239d1 -n5: dq 0x2ba9bf3d2f05b389 x6: ddq 0x79445c159ce790641a1b2550a612b48c x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943 @@ -44,6 +42,14 @@ x13: ddq 0xdd7b8919edd427862e8ec680de14b47c x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf x15: ddq 0x6de8f4c914c334d5011ff554472a7a10 +n7: dq 0x21f86d66c8ca00ce +n8: dq 0x75b6ba21077c48ad +n9: dq 0xed56bb2dcb3c7736 +n10: dq 0x8bda43d3fd1a7e06 +n11: dq 0xb64a9c9e5d318408 +n12: dq 0xdf9a54b303f1d3a3 +n13: dq 0x4a75479abd64e097 +n14: dq 0x249214109d5d1c88 %endif SECTION .text @@ -52,7 +58,7 @@ ; max number of args used by any x264 asm function. ; (max_args % 4) must equal 3 for stack alignment -%define max_args 11 +%define max_args 15 %ifdef WIN64 @@ -60,9 +66,8 @@ ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal checkasm_call, 4,7,16 - sub rsp, max_args*8 - %assign stack_offset stack_offset+max_args*8 +cglobal checkasm_call, 4,15,16 + SUB rsp, max_args*8 mov r6, r0 mov [rsp+stack_offset+16], r1 mov r0, r2 @@ -77,25 +82,30 @@ %endrep %assign i 6 %rep 16-6 - movdqa xmm %+ i, [x %+ i] + mova m %+ i, [x %+ i] + %assign i i+1 +%endrep +%assign i 7 +%rep 15-7 + mov r %+ i, [n %+ i] %assign i i+1 %endrep - mov r4, [n4] - mov r5, [n5] call r6 - xor r4, [n4] - xor r5, [n5] - or r4, r5 - pxor xmm5, xmm5 +%assign i 7 +%rep 15-7 + xor r %+ i, [n %+ i] + or r7, r %+ i + %assign i i+1 +%endrep %assign i 6 %rep 16-6 - pxor xmm %+ i, [x %+ i] - por xmm5, xmm %+ i + pxor m %+ i, [x %+ i] + por m6, m %+ i %assign i i+1 %endrep - packsswb xmm5, xmm5 - movq r5, xmm5 - or r4, r5 + packsswb m6, m6 + movq r5, m6 + or r7, r5 jz .ok mov r4, rax lea r0, [error_message] @@ -104,8 +114,7 @@ mov dword [r1], 0 mov rax, r4 .ok: - add rsp, max_args*8 - %assign stack_offset stack_offset-max_args*8 + ADD rsp, max_args*8 RET %elifndef ARCH_X86_64 | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/tools/checkasm.c
@@ -55,7 +55,7 @@ #define BENCH_RUNS 100 // tradeoff between accuracy and speed #define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff) #define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions -#define MAX_CPUS 10 // number of different combinations of cpu flags +#define MAX_CPUS 30 // number of different combinations of cpu flags typedef struct { @@ -168,11 +168,10 @@ b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : b->cpu&X264_CPU_SSE4 ? "sse4" : - b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : /* print sse2slow only if there's also a sse2fast version of the same func */ - b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : + b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : b->cpu&X264_CPU_SSE2 ? "sse2" : b->cpu&X264_CPU_MMX ? "mmx" : b->cpu&X264_CPU_ALTIVEC ? "altivec" : @@ -180,6 +179,7 @@ b->cpu&X264_CPU_ARMV6 ? "armv6" : "c", b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : + b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" : b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : b->cpu&X264_CPU_LZCNT ? "_lzcnt" : b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : @@ -426,6 +426,10 @@ } report( "pixel hadamard_ac :" ); + // maximize sum + for( int i = 0; i < 32; i++ ) + for( int j = 0; j < 16; j++ ) + pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX; ok = 1; used_asm = 0; if( pixel_asm.vsad != pixel_ref.vsad ) { @@ -434,13 +438,17 @@ int res_c, res_asm; set_func_name( "vsad" ); used_asm = 1; - res_c = call_c( pixel_c.vsad, pbuf1, 16, h ); - res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h ); - if( res_c != res_asm ) + for( int j = 0; j < 2 && ok; j++ ) { - ok = 0; - fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm ); - break; + pixel *p = j ? pbuf4 : pbuf1; + res_c = call_c( pixel_c.vsad, p, 16, h ); + res_asm = call_a( pixel_asm.vsad, p, 16, h ); + if( res_c != res_asm ) + { + ok = 0; + fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm ); + break; + } } } } @@ -516,8 +524,10 @@ set_func_name( #name ); \ used_asm = 1; \ ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \ - ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ) = {0}; \ - ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ) = {0}; \ + ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \ + ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \ + memset( satds_c, 0, 16 * sizeof(*satds_c) ); \ + memset( satds_a, 0, 16 * sizeof(*satds_a) ); \ for( int i=0; i<17; i++ ) \ bitcosts[i] = 9*(i!=8); \ for( int i=0; i<32; i++ ) \ @@ -649,7 +659,8 @@ { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); - int16_t mvs_a[32], mvs_c[32]; + ALIGNED_16( int16_t mvs_a[32] ); + ALIGNED_16( int16_t mvs_c[32] ); int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); @@ -718,8 +729,8 @@ { int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1)); int cond_b = (i == 0) ? 1 : !cond_a; - enc[0] = enc[1] = cond_a ? PIXEL_MAX : 0; - enc[2] = enc[3] = cond_b ? PIXEL_MAX : 0; + enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0; + enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? 
PIXEL_MAX : 0; for( int k = 0; k < 4; k++ ) dec[k] = PIXEL_MAX - enc[k]; @@ -744,6 +755,12 @@ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ + for( int k = 0; k < size; k++ )\ + printf( "%d ", ((dctcoef*)t1)[k] );\ + printf("\n");\ + for( int k = 0; k < size; k++ )\ + printf( "%d ", ((dctcoef*)t2)[k] );\ + printf("\n");\ break; \ } \ call_c( dct_c.name, t1, enc, dec ); \ @@ -1554,11 +1571,15 @@ TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] ); TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] ); + TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] ); + TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_luma_intra[0], 0 ); TEST_DEBLOCK( deblock_luma_intra[1], 1 ); TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 ); TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 ); + TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 ); + TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 ); TEST_DEBLOCK( deblock_chroma_intra[1], 1 ); if( db_a.deblock_strength != db_ref.deblock_strength ) @@ -1998,6 +2019,7 @@ int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \ int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ + runlevel_c.mask != runlevel_a.mask || \ memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \ { \ @@ -2293,6 +2315,9 @@ { *cpu_ref = *cpu_new; *cpu_new |= flags; +#if BROKEN_STACK_ALIGNMENT + *cpu_new |= X264_CPU_STACK_MOD4; +#endif if( *cpu_new & X264_CPU_SSE2_IS_FAST ) *cpu_new &= ~X264_CPU_SSE2_IS_SLOW; if( !quiet ) @@ -2327,6 +2352,7 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" ); + cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" ); cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); @@ -2336,23 +2362,24 @@ } if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN ) { - cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" ); cpu1 &= ~X264_CPU_SSE_MISALIGN; } if( x264_cpu_detect() & X264_CPU_LZCNT ) { - cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } if( x264_cpu_detect() & X264_CPU_SSE3 ) + { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); + cpu1 &= ~X264_CPU_CACHELINE_64; + } if( x264_cpu_detect() & X264_CPU_SSSE3 ) { - cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); + cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" ); cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" ); @@ -2361,10 +2388,7 @@ cpu1 &= ~X264_CPU_SLOW_ATOM; } if( x264_cpu_detect() & X264_CPU_SSE4 ) - { - cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); - } + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" ); if( x264_cpu_detect() & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, 
"AVX" ); if( x264_cpu_detect() & X264_CPU_XOP ) | ||
Changed: x264-snapshot-20120126-2245.tar.bz2/version.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+[ -n "$1" ] && cd $1
 git rev-list HEAD | sort > config.git-hash
 LOCALVER=`wc -l config.git-hash | awk '{print $1}'`
 if [ $LOCALVER \> 1 ] ; then
Changed: x264-snapshot-20120126-2245.tar.bz2/x264.c
@@ -53,6 +53,7 @@ #endif #if HAVE_SWSCALE +#undef DECLARE_ALIGNED #include <libswscale/swscale.h> #endif @@ -135,6 +136,8 @@ 0 }; +static const char * const range_names[] = { "auto", "tv", "pc", 0 }; + typedef struct { int mod; @@ -347,19 +350,22 @@ size_t line_len = strlen( INDENT ); for( enum PixelFormat i = PIX_FMT_NONE+1; i < PIX_FMT_NB; i++ ) { - const char *pfname = av_pix_fmt_descriptors[i].name; - size_t name_len = strlen( pfname ); - if( line_len + name_len > (80 - strlen( ", " )) ) - { - printf( "\n" INDENT ); - line_len = strlen( INDENT ); - } - printf( "%s", pfname ); - line_len += name_len; - if( i+1 < PIX_FMT_NB ) + const char *pfname = av_get_pix_fmt_name( i ); + if( pfname ) { - printf( ", " ); - line_len += 2; + size_t name_len = strlen( pfname ); + if( line_len + name_len > (80 - strlen( ", " )) ) + { + printf( "\n" INDENT ); + line_len = strlen( INDENT ); + } + printf( "%s", pfname ); + line_len += name_len; + if( i+1 < PIX_FMT_NB ) + { + printf( ", " ); + line_len += 2; + } } } #endif @@ -734,9 +740,8 @@ H2( " --videoformat <string> Specify video format [\"%s\"]\n" " - component, pal, ntsc, secam, mac, undef\n", strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) ); - H2( " --fullrange <string> Specify full range samples setting [\"%s\"]\n" - " - off, on\n", - strtable_lookup( x264_fullrange_names, defaults->vui.b_fullrange ) ); + H2( " --range <string> Specify color range [\"%s\"]\n" + " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" " - undef, bt709, bt470m, bt470bg\n" " smpte170m, smpte240m, film\n", @@ -772,6 +777,8 @@ H1( " --output-csp <string> Specify output colorspace [\"%s\"]\n" " - %s\n", output_csp_names[0], stringify_names( buf, output_csp_names ) ); H1( " --input-depth <integer> Specify input bit depth for raw input\n" ); + H1( " --input-range <string> Specify input color range [\"%s\"]\n" + " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H1( " --input-res <intxint> Specify input resolution (width x height)\n" ); H1( " --index <string> Filename for input index file\n" ); H0( " --sar width:height Specify Sample Aspect Ratio\n" ); @@ -853,7 +860,9 @@ OPT_INPUT_CSP, OPT_INPUT_DEPTH, OPT_DTS_COMPRESSION, - OPT_OUTPUT_CSP + OPT_OUTPUT_CSP, + OPT_INPUT_RANGE, + OPT_RANGE } OptionsOPT; static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw"; @@ -990,7 +999,7 @@ { "cqm8p", required_argument, NULL, 0 }, { "overscan", required_argument, NULL, 0 }, { "videoformat", required_argument, NULL, 0 }, - { "fullrange", required_argument, NULL, 0 }, + { "range", required_argument, NULL, OPT_RANGE }, { "colorprim", required_argument, NULL, 0 }, { "transfer", required_argument, NULL, 0 }, { "colormatrix", required_argument, NULL, 0 }, @@ -1013,6 +1022,7 @@ { "input-depth", required_argument, NULL, OPT_INPUT_DEPTH }, { "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION }, { "output-csp", required_argument, NULL, OPT_OUTPUT_CSP }, + { "input-range", required_argument, NULL, OPT_INPUT_RANGE }, {0, 0, 0, 0} }; @@ -1176,6 +1186,9 @@ else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) ) param->i_csp = X264_CSP_RGB; param->i_csp |= info->csp & X264_CSP_HIGH_DEPTH; + /* if the output range is not forced, assign it to the input one now */ + if( param->vui.b_fullrange == RANGE_AUTO ) + param->vui.b_fullrange = info->fullrange; if( x264_init_vid_filter( "resize", handle, &filter, info, param, NULL ) ) return -1; @@ -1237,6 +1250,7 @@ 
memset( &input_opt, 0, sizeof(cli_input_opt_t) ); memset( &output_opt, 0, sizeof(cli_output_opt_t) ); input_opt.bit_depth = 8; + input_opt.input_range = input_opt.output_range = param->vui.b_fullrange = RANGE_AUTO; int output_csp = defaults.i_csp; opt->b_progress = 1; @@ -1402,6 +1416,14 @@ #endif param->i_csp = output_csp = output_csp_fix[output_csp]; break; + case OPT_INPUT_RANGE: + FAIL_IF_ERROR( parse_enum_value( optarg, range_names, &input_opt.input_range ), "Unknown input range `%s'\n", optarg ) + input_opt.input_range += RANGE_AUTO; + break; + case OPT_RANGE: + FAIL_IF_ERROR( parse_enum_value( optarg, range_names, ¶m->vui.b_fullrange ), "Unknown range `%s'\n", optarg ); + input_opt.output_range = param->vui.b_fullrange += RANGE_AUTO; + break; default: generic_option: { @@ -1452,10 +1474,11 @@ video_info_t info = {0}; char demuxername[5]; - /* set info flags to param flags to be overwritten by demuxer as necessary. */ + /* set info flags to be overwritten by demuxer as necessary. */ info.csp = param->i_csp; info.fps_num = param->i_fps_num; info.fps_den = param->i_fps_den; + info.fullrange = input_opt.input_range == RANGE_PC; info.interlaced = param->b_interlaced; info.sar_width = param->vui.i_sar_width; info.sar_height = param->vui.i_sar_height; @@ -1540,6 +1563,8 @@ info.interlaced = param->b_interlaced; info.tff = param->b_tff; } + if( input_opt.input_range != RANGE_AUTO ) + info.fullrange = input_opt.input_range; if( init_vid_filters( vid_filters, &opt->hin, &info, param, output_csp ) ) return -1; @@ -1571,6 +1596,15 @@ x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, but not compiled with interlaced support\n" ); #endif } + /* if the user never specified the output range and the input is now rgb, default it to pc */ + int csp = param->i_csp & X264_CSP_MASK; + if( csp >= X264_CSP_BGR && csp <= X264_CSP_RGB ) + { + if( input_opt.output_range == RANGE_AUTO ) + param->vui.b_fullrange = RANGE_PC; + /* otherwise fail if they specified tv */ + FAIL_IF_ERROR( !param->vui.b_fullrange, "RGB must be PC range" ) + } /* Automatically reduce reference frame count to match the user's target level * if the user didn't explicitly set a reference frame count. */ | ||
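
Taken together, the x264.c changes resolve the three range knobs in a fixed order: an explicit --input-range overrides whatever the demuxer detected, an "auto" --range inherits the (possibly overridden) input range, and RGB output defaults to PC range and refuses TV. A hypothetical condensation of that decision into one helper (x264.c itself spreads it across option parsing and the filter-initialization path shown above):

    typedef enum { RANGE_AUTO = -1, RANGE_TV, RANGE_PC } range_enum;

    /* returns the effective output fullrange flag (0 = TV, 1 = PC),
     * or -1 for the rejected "tv range + RGB output" combination */
    static int resolve_output_range( int opt_output_range,   /* --range        */
                                     int opt_input_range,    /* --input-range  */
                                     int detected_fullrange, /* from demuxer   */
                                     int output_is_rgb )
    {
        int in  = opt_input_range != RANGE_AUTO ? opt_input_range == RANGE_PC
                                                : detected_fullrange;
        int out = opt_output_range != RANGE_AUTO ? opt_output_range == RANGE_PC
                                                 : in;  /* auto: keep the input range */
        if( output_is_rgb )
        {
            if( opt_output_range == RANGE_AUTO )
                out = 1;        /* RGB defaults to PC... */
            else if( !out )
                return -1;      /* ...and "RGB must be PC range" otherwise */
        }
        return out;
    }
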
Changed: x264-snapshot-20120126-2245.tar.bz2/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 119
+#define X264_BUILD 120
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -724,8 +724,7 @@
     x264_hrd_t hrd_timing;
     /* In: arbitrary user SEI (e.g subtitles, AFDs) */
     x264_sei_t extra_sei;
-    /* private user data. libx264 doesn't touch this,
-       not even copy it from input to output frames. */
+    /* private user data. copied from input to output frames. */
     void *opaque;
 } x264_picture_t;
 
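
The x264.h comment change is a small API behaviour change worth a usage note: from X264_BUILD 120 the opaque pointer set on an input picture is carried through to the corresponding output picture, so per-frame user data can be round-tripped through the encoder's lookahead. A minimal hedged sketch against the public API (assumes an already-configured encoder; error handling omitted):

    #include <stdint.h>
    #include <x264.h>

    static void encode_one( x264_t *enc, x264_picture_t *pic_in, int64_t frame_number )
    {
        /* the storage the opaque pointer refers to must outlive the encoder's lookahead */
        static int64_t numbers[256];
        numbers[frame_number & 255] = frame_number;
        pic_in->opaque = &numbers[frame_number & 255];

        x264_picture_t pic_out;
        x264_nal_t *nal;
        int i_nal;
        if( x264_encoder_encode( enc, &nal, &i_nal, pic_in, &pic_out ) > 0 )
        {
            /* since build 120 this is the pointer that was set on the matching
             * input frame; earlier builds did not copy it to output pictures */
            int64_t *src_frame_number = pic_out.opaque;
            (void)src_frame_number;
        }
    }
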
Changed: x264-snapshot-20120126-2245.tar.bz2/x264cli.h
@@ -72,4 +72,11 @@
 
 #define FAIL_IF_ERR( cond, name, ... ) RETURN_IF_ERR( cond, name, -1, __VA_ARGS__ )
 
+typedef enum
+{
+    RANGE_AUTO = -1,
+    RANGE_TV,
+    RANGE_PC
+} range_enum;
+
 #endif
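
RANGE_AUTO being -1 is what makes the "+= RANGE_AUTO" lines in x264.c work: parse_enum_value() yields a 0-based index into range_names ("auto", "tv", "pc"), and adding RANGE_AUTO shifts that to -1/0/1. A tiny sketch of the offset trick with a stand-in lookup (parse_enum_value itself lives in x264.c and is not reproduced here):

    #include <stdio.h>
    #include <string.h>

    typedef enum { RANGE_AUTO = -1, RANGE_TV, RANGE_PC } range_enum;

    static const char * const range_names[] = { "auto", "tv", "pc", 0 };

    /* stand-in for x264.c's parse_enum_value(): 0 on success with the 0-based
     * index stored in *dst, nonzero if the argument is not in the table */
    static int lookup( const char *arg, const char * const *names, int *dst )
    {
        for( int i = 0; names[i]; i++ )
            if( !strcmp( arg, names[i] ) )
            {
                *dst = i;
                return 0;
            }
        return -1;
    }

    int main( void )
    {
        int range;
        if( !lookup( "tv", range_names, &range ) )
        {
            range += RANGE_AUTO;                     /* 0/1/2 -> -1/0/1 */
            printf( "parsed range = %d\n", range );  /* prints 0, i.e. RANGE_TV */
        }
        return 0;
    }
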