Changed: x264.changes
Changed: x264.spec
Changed: x264-use-shared-library.patch
@@ -1,15 +1,21 @@
---- Makefile.orig 2011-05-27 22:45:04.000000000 +0200
-+++ Makefile 2011-05-28 15:18:29.883305471 +0200
-@@ -149,9 +149,10 @@
+--- Makefile.orig 2011-12-26 22:45:03.000000000 +0100
++++ Makefile 2011-12-27 20:03:46.070404383 +0100
+@@ -152,6 +152,7 @@
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ ln -s $(SONAME) libx264.so
+ ifneq ($(EXE),)
+ .PHONY: x264 checkasm
+@@ -159,8 +160,8 @@
+ checkasm: checkasm$(EXE)
+ endif
+
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
- $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
+x264$(EXE): .depend $(OBJCLI) $(SONAME)
+ $(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
- checkasm: tools/checkasm.o $(LIBX264)
- $(LD)$@ $+ $(LDFLAGS)
+ checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
Changed: x264-snapshot-20120126-2245.tar.bz2/Makefile
@@ -2,6 +2,11 @@
include config.mak
+vpath %.c $(SRCPATH)
+vpath %.h $(SRCPATH)
+vpath %.S $(SRCPATH)
+vpath %.asm $(SRCPATH)
+
all: default
SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
@@ -22,6 +27,8 @@
SRCSO =
+OBJCHK = tools/checkasm.o
+
CONFIG := $(shell cat config.h)
# GPL-only files
@@ -79,16 +86,16 @@
ifeq ($(ARCH),X86_64)
ARCH_X86 = yes
-ASMSRC = $(X86SRC:-32.asm=-64.asm)
+ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
ASFLAGS += -DARCH_X86_64
endif
ifdef ARCH_X86
-ASFLAGS += -Icommon/x86/
+ASFLAGS += -I$(SRCPATH)/common/x86/
SRCS += common/x86/mc-c.c common/x86/predict-c.c
OBJASM = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
-checkasm: tools/checkasm-a.o
+OBJCHK += tools/checkasm-a.o
endif
endif
@@ -135,7 +142,7 @@
OBJSO = $(SRCSO:%.c=%.o)
DEP = depend
-.PHONY: all default fprofiled clean distclean install uninstall dox test testclean lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
+.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
default: $(DEP)
@@ -144,17 +151,26 @@
lib-shared: $(SONAME)
$(LIBX264): .depend $(OBJS) $(OBJASM)
+ rm -f $(LIBX264)
$(AR)$@ $(OBJS) $(OBJASM)
$(if $(RANLIB), $(RANLIB) $@)
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ifneq ($(EXE),)
+.PHONY: x264 checkasm
+x264: x264$(EXE)
+checkasm: checkasm$(EXE)
+endif
+
x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-checkasm: tools/checkasm.o $(LIBX264)
- $(LD)$@ $+ $(LDFLAGS)
+checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
+
+$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
@@ -166,7 +182,7 @@
.depend: config.mak
@rm -f .depend
- @$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:%.c=%.o) $(DEPMM) 1>> .depend;)
+ @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
config.mak:
./configure
@@ -204,12 +220,11 @@
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
- rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
+ rm -f checkasm checkasm.exe $(OBJCHK)
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
- rm -rf test/
install-cli: cli
install -d $(DESTDIR)$(bindir)
@@ -219,7 +234,7 @@
install -d $(DESTDIR)$(includedir)
install -d $(DESTDIR)$(libdir)
install -d $(DESTDIR)$(libdir)/pkgconfig
- install -m 644 x264.h $(DESTDIR)$(includedir)
+ install -m 644 $(SRCPATH)/x264.h $(DESTDIR)$(includedir)
install -m 644 x264_config.h $(DESTDIR)$(includedir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
Changed: x264-snapshot-20120126-2245.tar.bz2/common/arm/predict-a.S
@@ -102,6 +102,21 @@
bx lr
.endfunc
+function x264_predict_4x4_dc_top_neon
+ mov r12, #FDEC_STRIDE
+ sub r1, r0, #FDEC_STRIDE
+ vld1.32 d1[], [r1,:32]
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1, d1
+ vrshr.u16 d1, d1, #2
+ vdup.8 d1, d1[0]
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ bx lr
+.endfunc
+
// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
uhadd8 \a1, \a1, \c1
@@ -211,6 +226,202 @@
bx lr
.endfunc
+function x264_predict_8x8_v_neon
+ add r1, r1, #16
+ mov r12, #FDEC_STRIDE
+ vld1.8 {d0}, [r1,:64]
+.rept 8
+ vst1.8 {d0}, [r0,:64], r12
+.endr
+ bx lr
+.endfunc
+
+function x264_predict_8x8_ddl_neon
+ add r1, #16
+ vld1.8 {d0, d1}, [r1,:128]
+ vmov.i8 q3, #0
+ vrev64.8 d2, d1
+ vext.8 q8, q3, q0, #15
+ vext.8 q2, q0, q1, #1
+ vhadd.u8 q8, q2
+ mov r12, #FDEC_STRIDE
+ vrhadd.u8 q0, q8
+ vext.8 d2, d0, d1, #1
+ vext.8 d3, d0, d1, #2
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #3
+ vst1.8 d3, [r0,:64], r12
+ vext.8 d3, d0, d1, #4
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #5
+ vst1.8 d3, [r0,:64], r12
+ vext.8 d3, d0, d1, #6
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #7
+ vst1.8 d3, [r0,:64], r12
+ vst1.8 d2, [r0,:64], r12
+ vst1.8 d1, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_ddr_neon
+ vld1.8 {d0-d3}, [r1,:128]
+ vext.8 q2, q0, q1, #7
+ vext.8 q3, q0, q1, #9
+
+ vhadd.u8 q2, q2, q3
+ vrhadd.u8 d0, d1, d4
+ vrhadd.u8 d1, d2, d5
+
+ add r0, #7*FDEC_STRIDE
+ mov r12, #-1*FDEC_STRIDE
+
+ vext.8 d2, d0, d1, #1
+ vst1.8 {d0}, [r0,:64], r12
+ vext.8 d4, d0, d1, #2
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d5, d0, d1, #3
+ vst1.8 {d4}, [r0,:64], r12
+ vext.8 d4, d0, d1, #4
+ vst1.8 {d5}, [r0,:64], r12
+ vext.8 d5, d0, d1, #5
+ vst1.8 {d4}, [r0,:64], r12
+ vext.8 d4, d0, d1, #6
+ vst1.8 {d5}, [r0,:64], r12
+ vext.8 d5, d0, d1, #7
+ vst1.8 {d4}, [r0,:64], r12
+ vst1.8 {d5}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_vl_neon
+ add r1, #16
+ mov r12, #FDEC_STRIDE
+
+ vld1.8 {d0, d1}, [r1,:128]
+ vext.8 q1, q1, q0, #15
+ vext.8 q2, q0, q2, #1
+
+ vrhadd.u8 q3, q0, q2
+
+ vhadd.u8 q1, q1, q2
+ vrhadd.u8 q0, q0, q1
+
+ vext.8 d2, d0, d1, #1
+ vst1.8 {d6}, [r0,:64], r12
+ vext.8 d3, d6, d7, #1
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d3, d6, d7, #2
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #3
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d3, d6, d7, #3
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #4
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_vr_neon
+ add r1, #8
+ mov r12, #FDEC_STRIDE
+ vld1.8 {d4,d5}, [r1,:64]
+
+ vext.8 q1, q2, q2, #14
+ vext.8 q0, q2, q2, #15
+
+ vhadd.u8 q3, q2, q1
+ vrhadd.u8 q2, q2, q0
+ vrhadd.u8 q0, q0, q3
+
+ vmov d2, d0
+
+ vst1.8 {d5}, [r0,:64], r12
+ vuzp.8 d2, d0
+ vst1.8 {d1}, [r0,:64], r12
+ vext.8 d6, d0, d5, #7
+ vext.8 d3, d2, d1, #7
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d6, d0, d5, #6
+ vext.8 d3, d2, d1, #6
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d6, d0, d5, #5
+ vext.8 d3, d2, d1, #5
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_hd_neon
+ mov r12, #FDEC_STRIDE
+ add r1, #7
+
+ vld1.8 {d2,d3}, [r1]
+ vext.8 q3, q1, q1, #1
+ vext.8 q2, q1, q1, #2
+
+ vrhadd.u8 q8, q1, q3
+
+ vhadd.u8 q1, q2
+ vrhadd.u8 q0, q1, q3
+
+ vzip.8 d16, d0
+
+ vext.8 d2, d0, d1, #6
+ vext.8 d3, d0, d1, #4
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d16, d0, #6
+ vst1.8 {d0}, [r0,:64], r12
+ vext.8 d3, d16, d0, #4
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d16, d0, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ vst1.8 {d16}, [r0,:64], r12
+
+ bx lr
+.endfunc
+
+function x264_predict_8x8_hu_neon
+ mov r12, #FDEC_STRIDE
+ add r1, #7
+ vld1.8 {d7}, [r1]
+ vdup.8 d6, d7[0]
+ vrev64.8 d7, d7
+
+ vext.8 d4, d7, d6, #2
+ vext.8 d2, d7, d6, #1
+
+ vhadd.u8 d16, d7, d4
+ vrhadd.u8 d0, d2, d7
+ vrhadd.u8 d1, d16, d2
+
+ vzip.8 d0, d1
+
+ vdup.16 q1, d1[3]
+
+ vext.8 q2, q0, q1, #2
+ vext.8 q3, q0, q1, #4
+ vext.8 q8, q0, q1, #6
+ vst1.8 {d0}, [r0,:64], r12
+ vst1.8 {d4}, [r0,:64], r12
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d16}, [r0,:64], r12
+
+ vst1.8 {d1}, [r0,:64], r12
+ vst1.8 {d5}, [r0,:64], r12
+ vst1.8 {d7}, [r0,:64], r12
+ vst1.8 {d17}, [r0,:64]
+ bx lr
+.endfunc
function x264_predict_8x8c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
@@ -223,7 +434,7 @@
vdup.8 d0, d0[0]
vtrn.32 d0, d1
b pred8x8_dc_end
- .endfunc
+.endfunc
function x264_predict_8x8c_dc_left_neon
mov r1, #FDEC_STRIDE
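For reference, the new x264_predict_4x4_dc_top_neon above implements the standard H.264 DC-top prediction: vpaddl/vpadd sum the four pixels above the block, vrshr.u16 #2 takes the rounded average, and vdup.8 broadcasts it over the 4x4 block. A minimal C sketch of the same operation (the FDEC_STRIDE value here is an assumption for the 8-bit build):

    #include <stdint.h>

    #define FDEC_STRIDE 32  /* assumed row stride of x264's decoded-MB buffer (8-bit build) */

    /* Rounded average of the four pixels above the block, broadcast into 4x4. */
    static void predict_4x4_dc_top_ref( uint8_t *src )
    {
        const uint8_t *top = src - FDEC_STRIDE;
        int dc = ( top[0] + top[1] + top[2] + top[3] + 2 ) >> 2;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                src[y*FDEC_STRIDE + x] = (uint8_t)dc;
    }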
Changed: x264-snapshot-20120126-2245.tar.bz2/common/arm/predict-c.c
@@ -28,6 +28,7 @@
#include "pixel.h"
void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
void x264_predict_4x4_h_armv6( uint8_t *src );
void x264_predict_4x4_ddr_armv6( uint8_t *src );
void x264_predict_4x4_ddl_neon( uint8_t *src );
@@ -40,7 +41,14 @@
void x264_predict_8x8c_p_neon( uint8_t *src );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
@@ -62,6 +70,7 @@
if (!(cpu&X264_CPU_NEON))
return;
+ pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
#endif // !HIGH_BIT_DEPTH
}
@@ -87,8 +96,15 @@
return;
#if !HIGH_BIT_DEPTH
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon;
pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon;
#endif // !HIGH_BIT_DEPTH
}
Changed: x264-snapshot-20120126-2245.tar.bz2/common/bitstream.h
@@ -56,6 +56,7 @@
typedef struct
{
int last;
+ int mask;
dctcoef level[16];
uint8_t run[16];
} x264_run_level_t;
@@ -65,7 +66,6 @@
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-extern const vlc_t x264_run_before[7][16];
typedef struct
{
@@ -82,6 +82,11 @@
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+/* The longest possible set of zero run codes sums to 25 bits. This leaves
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
+
+extern uint32_t x264_run_before[1<<16];
+
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & 3);
Changed: x264-snapshot-20120126-2245.tar.bz2/common/common.h
@@ -236,7 +236,7 @@
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
@@ -498,6 +498,8 @@
udctcoef (*quant8_mf[4])[64]; /* [4][52][64] */
udctcoef (*quant4_bias[4])[16]; /* [4][52][16] */
udctcoef (*quant8_bias[4])[64]; /* [4][52][64] */
+ udctcoef (*quant4_bias0[4])[16]; /* [4][52][16] */
+ udctcoef (*quant8_bias0[4])[64]; /* [4][52][64] */
udctcoef (*nr_offset_emergency)[4][64];
/* mv/ref cost arrays. */
Changed: x264-snapshot-20120126-2245.tar.bz2/common/dct.c
@@ -36,8 +36,69 @@
# include "arm/dct.h"
#endif
-uint16_t x264_dct4_weight2_zigzag[2][16];
-uint16_t x264_dct8_weight2_zigzag[2][64];
+/* the inverse of the scaling factors introduced by 8x8 fdct */
+/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
+#define W(i) (i==0 ? FIX8(1.0000) :\
+ i==1 ? FIX8(0.8859) :\
+ i==2 ? FIX8(1.6000) :\
+ i==3 ? FIX8(0.9415) :\
+ i==4 ? FIX8(1.2651) :\
+ i==5 ? FIX8(1.1910) :0)
+const uint32_t x264_dct8_weight_tab[64] = {
+ W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+ W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+
+ W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+ W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
+};
+#undef W
+
+#define W(i) (i==0 ? FIX8(1.76777) :\
+ i==1 ? FIX8(1.11803) :\
+ i==2 ? FIX8(0.70711) :0)
+const uint32_t x264_dct4_weight_tab[16] = {
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2),
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2)
+};
+#undef W
+
+/* inverse squared */
+#define W(i) (i==0 ? FIX8(3.125) :\
+ i==1 ? FIX8(1.25) :\
+ i==2 ? FIX8(0.5) :0)
+const uint32_t x264_dct4_weight2_tab[16] = {
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2),
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2)
+};
+#undef W
+
+#define W(i) (i==0 ? FIX8(1.00000) :\
+ i==1 ? FIX8(0.78487) :\
+ i==2 ? FIX8(2.56132) :\
+ i==3 ? FIX8(0.88637) :\
+ i==4 ? FIX8(1.60040) :\
+ i==5 ? FIX8(1.41850) :0)
+const uint32_t x264_dct8_weight2_tab[64] = {
+ W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+ W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+
+ W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
+ W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
+ W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
+};
+#undef W
+
static void dct4x4dc( dctcoef d[16] )
{
@@ -509,19 +570,35 @@
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
+ dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
+ if( cpu&X264_CPU_SSE4 )
+ {
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
+ }
if( cpu&X264_CPU_AVX )
{
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
+ dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
}
#endif // HAVE_MMX
@@ -555,6 +632,7 @@
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
+ dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
@@ -572,6 +650,7 @@
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
@@ -592,6 +671,12 @@
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
}
+
+ if( cpu&X264_CPU_XOP )
+ {
+ dctf->sub8x8_dct = x264_sub8x8_dct_xop;
+ dctf->sub16x16_dct = x264_sub16x16_dct_xop;
+ }
#endif //HAVE_MMX
#if HAVE_ALTIVEC
@@ -639,17 +724,6 @@
#endif // HIGH_BIT_DEPTH
}
-void x264_dct_init_weights( void )
-{
- for( int j = 0; j < 2; j++ )
- {
- for( int i = 0; i < 16; i++ )
- x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
- for( int i = 0; i < 64; i++ )
- x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
- }
-}
-
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
Changed: x264-snapshot-20120126-2245.tar.bz2/common/dct.h
@@ -26,70 +26,10 @@
#ifndef X264_DCT_H
#define X264_DCT_H
-/* the inverse of the scaling factors introduced by 8x8 fdct */
-#define W(i) (i==0 ? FIX8(1.0000) :\
- i==1 ? FIX8(0.8859) :\
- i==2 ? FIX8(1.6000) :\
- i==3 ? FIX8(0.9415) :\
- i==4 ? FIX8(1.2651) :\
- i==5 ? FIX8(1.1910) :0)
-static const uint16_t x264_dct8_weight_tab[64] = {
- W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
- W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
-
- W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
- W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
-};
-#undef W
-
-#define W(i) (i==0 ? FIX8(1.76777) :\
- i==1 ? FIX8(1.11803) :\
- i==2 ? FIX8(0.70711) :0)
-static const uint16_t x264_dct4_weight_tab[16] = {
- W(0), W(1), W(0), W(1),
- W(1), W(2), W(1), W(2),
- W(0), W(1), W(0), W(1),
- W(1), W(2), W(1), W(2)
-};
-#undef W
-
-/* inverse squared */
-#define W(i) (i==0 ? FIX8(3.125) :\
- i==1 ? FIX8(1.25) :\
- i==2 ? FIX8(0.5) :0)
-static const uint16_t x264_dct4_weight2_tab[16] = {
- W(0), W(1), W(0), W(1),
- W(1), W(2), W(1), W(2),
- W(0), W(1), W(0), W(1),
- W(1), W(2), W(1), W(2)
-};
-#undef W
-
-#define W(i) (i==0 ? FIX8(1.00000) :\
- i==1 ? FIX8(0.78487) :\
- i==2 ? FIX8(2.56132) :\
- i==3 ? FIX8(0.88637) :\
- i==4 ? FIX8(1.60040) :\
- i==5 ? FIX8(1.41850) :0)
-static const uint16_t x264_dct8_weight2_tab[64] = {
- W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
- W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
-
- W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
- W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
- W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
-};
-#undef W
-
-extern uint16_t x264_dct4_weight2_zigzag[2][16]; // [2] = {frame, field}
-extern uint16_t x264_dct8_weight2_zigzag[2][64];
+extern const uint32_t x264_dct4_weight_tab[16];
+extern const uint32_t x264_dct8_weight_tab[64];
+extern const uint32_t x264_dct4_weight2_tab[16];
+extern const uint32_t x264_dct8_weight2_tab[64];
typedef struct
{
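The weight tables above, now defined once in dct.c and only declared in dct.h, are built with x264's FIX8 macro, which stores each floating-point scale factor in 8.8 fixed point. A stand-alone sketch of that conversion, assuming round-to-nearest:

    /* 8.8 fixed point, e.g. FIX8(1.6000) == 410, and 410/256 ~= 1.6 */
    #define FIX8(f) ( (int)( (f) * (1 << 8) + 0.5 ) )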
Changed: x264-snapshot-20120126-2245.tar.bz2/common/deblock.c
@@ -165,13 +165,7 @@
}
static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- for( int i = 0; i < 4; i++, pix += stride )
- deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] );
-}
-static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
- for( int i = 0; i < 8; i++, pix += stride )
- deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] );
+ deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
@@ -265,13 +259,7 @@
}
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
{
- for( int i = 0; i < 4; i++, pix += stride )
- deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
-}
-static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
-{
- for( int i = 0; i < 8; i++, pix += stride )
- deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
+ deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
@@ -474,13 +462,15 @@
{
deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock );
deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
- deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
+ if( chroma444 )
+ deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
}
else
{
deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock );
deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
- deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
+ if( chroma444 )
+ deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
}
int offy = MB_INTERLACED ? 4 : 0;
@@ -492,13 +482,15 @@
{
deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_intra_deblock );
deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
- deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
+ if( chroma444 )
+ deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
}
else
{
deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_deblock );
deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
- deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
+ if( chroma444 )
+ deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
}
}
else
@@ -647,6 +639,8 @@
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
@@ -658,6 +652,9 @@
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
@@ -670,15 +667,21 @@
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
+
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta );
#if ARCH_X86
void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
@@ -725,10 +728,8 @@
pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
- pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
- pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
pf->deblock_strength = deblock_strength_c;
#if HAVE_MMX
@@ -739,22 +740,26 @@
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
-#if !HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
-#endif
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
+#endif
+#if !HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
-#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
-#endif
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
@@ -764,6 +769,9 @@
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
+#if HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
+#endif
}
}
if( cpu&X264_CPU_SSSE3 )
@@ -772,9 +780,8 @@
{
pf->deblock_strength = x264_deblock_strength_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
-#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
-#endif
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
@@ -784,6 +791,10 @@
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
+#if HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
+#endif
}
}
}
@@ -808,4 +819,8 @@
}
#endif
#endif // !HIGH_BIT_DEPTH
+
+ /* These functions are equivalent, so don't duplicate them. */
+ pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
+ pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
}
Changed: x264-snapshot-20120126-2245.tar.bz2/common/frame.c
@@ -353,6 +353,7 @@
dst->param = src->param;
dst->i_pic_struct = src->i_pic_struct;
dst->extra_sei = src->extra_sei;
+ dst->opaque = src->opaque;
uint8_t *pix[3];
int stride[3];
Changed: x264-snapshot-20120126-2245.tar.bz2/common/frame.h
@@ -162,6 +162,9 @@
/* user sei */
x264_sei_t extra_sei;
+
+ /* user data */
+ void *opaque;
} x264_frame_t;
/* synchronized frame list */
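frame.c and frame.h now carry the caller's per-picture user-data pointer through the internal frame copy, so whatever is stored in x264_picture_t.opaque on input comes back attached to the corresponding output picture. A hedged illustration of that round trip (encode_with_user_data is a hypothetical wrapper; x264_encoder_encode and the opaque field are the public API):

    #include <x264.h>

    /* Attach a caller-owned pointer to a frame and recover it when that frame,
     * possibly reordered, comes back out of the encoder. */
    static int encode_with_user_data( x264_t *h, x264_picture_t *pic_in, void *user_data,
                                      x264_nal_t **nal, int *i_nal, void **user_data_out )
    {
        x264_picture_t pic_out;
        pic_in->opaque = user_data;                   /* e.g. a capture timestamp */
        int frame_size = x264_encoder_encode( h, nal, i_nal, pic_in, &pic_out );
        if( frame_size > 0 )
            *user_data_out = pic_out.opaque;          /* same pointer, frame-accurate */
        return frame_size;
    }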
Changed: x264-snapshot-20120126-2245.tar.bz2/common/mc.c
@@ -304,9 +304,9 @@
}
}
-void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
+ pixel *dstv, int i_dstv,
+ pixel *src, int i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int x=0; x<w; x++ )
@@ -316,10 +316,10 @@
}
}
-void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
- pixel *dstb, int i_dstb,
- pixel *dstc, int i_dstc,
- pixel *src, int i_src, int pw, int w, int h )
+static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
+ pixel *dstb, int i_dstb,
+ pixel *dstc, int i_dstc,
+ pixel *src, int i_src, int pw, int w, int h )
{
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
{
Changed: x264-snapshot-20120126-2245.tar.bz2/common/osdep.h
@@ -31,15 +31,10 @@
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/stat.h>
+#include <inttypes.h>
#include "config.h"
-#if HAVE_STDINT_H
-#include <stdint.h>
-#else
-#include <inttypes.h>
-#endif
-
#if !HAVE_LOG2F
#define log2f(x) (logf(x)/0.693147180559945f)
#define log2(x) (log(x)/0.693147180559945)
Changed: x264-snapshot-20120126-2245.tar.bz2/common/pixel.c
@@ -522,8 +522,6 @@
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmx2, _c )
-INTRA_MBCMP_8x8( sad, _sse2, _sse2 )
-INTRA_MBCMP_8x8( sad, _ssse3, _sse2 )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
@@ -550,14 +548,10 @@
#if HAVE_MMX
#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
-INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
-INTRA_MBCMP(satd, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
-INTRA_MBCMP(satd, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
-INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
#else
@@ -865,6 +859,7 @@
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
@@ -872,6 +867,7 @@
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
@@ -884,7 +880,7 @@
{
INIT4( hadamard_ac, _sse2 );
}
-
+ pixf->vsad = x264_pixel_vsad_sse2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
@@ -911,7 +907,8 @@
{
INIT4( hadamard_ac, _ssse3 );
}
-
+ pixf->vsad = x264_pixel_vsad_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
@@ -935,6 +932,7 @@
{
INIT4( hadamard_ac, _avx );
}
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
@@ -943,6 +941,10 @@
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
}
+ if( cpu&X264_CPU_XOP )
+ {
+ pixf->vsad = x264_pixel_vsad_xop;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
Changed: x264-snapshot-20120126-2245.tar.bz2/common/quant.c
@@ -373,14 +373,17 @@
{\
int i_last = runlevel->last = x264_coeff_last##num(dct);\
int i_total = 0;\
+ int mask = 0;\
do\
{\
int r = 0;\
runlevel->level[i_total] = dct[i_last];\
+ mask |= 1 << (i_last);\
while( --i_last >= 0 && dct[i_last] == 0 )\
r++;\
runlevel->run[i_total++] = r;\
} while( i_last >= 0 );\
+ runlevel->mask = mask;\
return i_total;\
}
@@ -389,6 +392,18 @@
level_run(15)
level_run(16)
+#if ARCH_X86_64
+#define INIT_TRELLIS(cpu)\
+ pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
+ pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
+ pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
+ pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\
+ pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
+ pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;
+#else
+#define INIT_TRELLIS(...)
+#endif
+
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
@@ -423,6 +438,7 @@
#if HIGH_BIT_DEPTH
#if HAVE_MMX
+ INIT_TRELLIS( sse2 );
if( cpu&X264_CPU_MMX2 )
{
#if ARCH_X86
@@ -500,6 +516,7 @@
pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_ssse3;
+ INIT_TRELLIS( ssse3 );
}
if( cpu&X264_CPU_SSE4 )
{
@@ -524,6 +541,7 @@
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
+ INIT_TRELLIS( sse2 );
if( cpu&X264_CPU_MMX )
{
#if ARCH_X86
@@ -627,6 +645,7 @@
pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_ssse3;
+ INIT_TRELLIS( ssse3 );
}
if( cpu&X264_CPU_SSE4 )
Changed: x264-snapshot-20120126-2245.tar.bz2/common/quant.h
@@ -55,6 +55,17 @@
int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel );
int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel );
+
+#define TRELLIS_PARAMS const int *unquant_mf, const uint8_t *zigzag, int lambda2,\
+ int last_nnz, dctcoef *coefs, dctcoef *quant_coefs, dctcoef *dct,\
+ uint8_t *cabac_state_sig, uint8_t *cabac_state_last,\
+ uint64_t level_state0, uint16_t level_state1
+ int (*trellis_cabac_4x4)( TRELLIS_PARAMS, int b_ac );
+ int (*trellis_cabac_8x8)( TRELLIS_PARAMS, int b_interlaced );
+ int (*trellis_cabac_4x4_psy)( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int psy_trellis );
+ int (*trellis_cabac_8x8_psy)( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int psy_trellis );
+ int (*trellis_cabac_dc)( TRELLIS_PARAMS, int num_coefs );
+ int (*trellis_cabac_chroma_422_dc)( TRELLIS_PARAMS );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
Changed: x264-snapshot-20120126-2245.tar.bz2/common/rectangle.c
@@ -26,7 +26,7 @@
#include "common.h"
#define CACHE_FUNC(name,size,width,height)\
-void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
+static void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
{\
x264_macroblock_cache_rect( target, width*size, height, size, val );\
}
Changed: x264-snapshot-20120126-2245.tar.bz2/common/set.c
@@ -112,9 +112,15 @@
!memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) )
break;
if( j < i )
+ {
h->quant4_bias[i] = h->quant4_bias[j];
+ h->quant4_bias0[i] = h->quant4_bias0[j];
+ }
else
+ {
CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );
+ CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );
+ }
}
for( int q = 0; q < 6; q++ )
@@ -163,6 +169,7 @@
}
// round to nearest, unless that would cause the deadzone to be negative
h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+ h->quant4_bias0[i_list][q][i] = (1<<15)/j;
if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
max_qp_err = q;
if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
@@ -182,6 +189,7 @@
continue;
}
h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+ h->quant8_bias0[i_list][q][i] = (1<<15)/j;
if( j > 0xffff && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) )
max_qp_err = q;
if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) )
@@ -272,7 +280,10 @@
if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\
break;\
if( j == i )\
+ {\
x264_free( h->quant##n##_bias[i] );\
+ x264_free( h->quant##n##_bias0[i] );\
+ }\
}
void x264_cqm_delete( x264_t *h )
@@ -351,8 +362,8 @@
b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_LUMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 );
if( CHROMA444 )
{
- b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8iy, x264_cqm_jvt8i, 64 );
- b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 );
+ b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8ic, x264_cqm_jvt8i, 64 );
+ b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8pc, x264_cqm_jvt8p, 64 );
}
x264_free( buf );
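set.c now allocates a second pair of tables, quant4_bias0/quant8_bias0, holding the plain round-to-nearest offset (1<<15)/j next to the dead-zone-biased offsets; the assembly trellis added in this snapshot wants the unbiased form. A rough scalar sketch of the quantizer these offsets feed, assuming the (|coef| + bias) * mf >> 16 form used by x264's SIMD quant routines (quant_one_coef is an illustrative name, not the real function):

    #include <stdint.h>

    /* mf is the per-coefficient quant multiplier (quant4_mf), bias the rounding
     * or dead-zone offset (quant4_bias / quant4_bias0), both indexed by QP. */
    static inline int quant_one_coef( int coef, uint32_t mf, uint32_t bias )
    {
        int sign = coef >> 31;                            /* 0 or -1 */
        uint32_t abs_coef = (uint32_t)((coef ^ sign) - sign);
        int level = (int)( ( (uint64_t)(abs_coef + bias) * mf ) >> 16 );
        return (level ^ sign) - sign;                     /* restore the sign */
    }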
Changed: x264-snapshot-20120126-2245.tar.bz2/common/vlc.c
@@ -738,7 +738,7 @@
};
/* [MIN( i_zero_left-1, 6 )][run_before] */
-const vlc_t x264_run_before[7][16] =
+static const vlc_t run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x1, 1 }, /* str=1 */
@@ -799,8 +799,9 @@
};
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+uint32_t x264_run_before[1<<16];
-void x264_cavlc_init( void )
+void x264_cavlc_init( x264_t *h )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
@@ -840,4 +841,27 @@
i_next++;
vlc->i_next = i_next;
}
+
+ for( int i = 1; i < (1<<16); i++ )
+ {
+ x264_run_level_t runlevel;
+ ALIGNED_ARRAY_16( dctcoef, dct, [16] );
+ int size = 0;
+ int bits = 0;
+ for( int j = 0; j < 16; j++ )
+ dct[j] = i&(1<<j);
+ int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
+ int zeros = runlevel.last + 1 - total;
+ for( int j = 0; j < total-1 && zeros > 0; j++ )
+ {
+ int idx = X264_MIN(zeros, 7) - 1;
+ int run = runlevel.run[j];
+ int len = run_before[idx][run].i_size;
+ size += len;
+ bits <<= len;
+ bits |= run_before[idx][run].i_bits;
+ zeros -= run;
+ }
+ x264_run_before[i] = (bits << 5) + size;
+ }
}
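With the nonzero-coefficient mask that coeff_level_run (quant.c) now records, the whole set of run_before codes for a block can be fetched and written in one step instead of looping over the old per-coefficient VLC table. A sketch of a consumer, assuming bitstream.h's bs_write( bs_t *s, int i_count, uint32_t i_bits ) and the (codes << 5) | total_size packing built above; write_zero_runs is a hypothetical name, not the actual call site in cavlc.c:

    #include "common/common.h"   /* bs_t, bs_write, x264_run_level_t, x264_run_before */

    /* runlevel->mask has one bit set per nonzero coefficient position, so a
     * single table lookup plus one bs_write emits every run_before code. */
    static inline void write_zero_runs( bs_t *s, const x264_run_level_t *runlevel )
    {
        uint32_t packed = x264_run_before[runlevel->mask];
        bs_write( s, packed & 0x1f, packed >> 5 );   /* low 5 bits = length, rest = codes */
    }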
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/cabac-a.asm
@@ -35,7 +35,7 @@
; t3 must be ecx, since it's used for shift.
%ifdef WIN64
- DECLARE_REG_TMP 3,1,2,0,4,5,6,2
+ DECLARE_REG_TMP 3,1,2,0,6,5,4,2
%define pointer resq
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
@@ -61,11 +61,11 @@
%macro LOAD_GLOBAL 4
%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- lea r11, [%2]
+ lea r7, [%2]
%ifnidn %3, 0
- add r11, %3
+ add r7, %3
%endif
- movzx %1, byte [r11+%4]
+ movzx %1, byte [r7+%4]
%else
movzx %1, byte [%2+%3+%4]
%endif
@@ -81,6 +81,9 @@
and t4d, t6d
shr t5d, 6
movifnidn t2d, r2m
+%ifdef WIN64
+ PUSH r7
+%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
and t6d, 1
@@ -95,6 +98,9 @@
mov t4d, t3d
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
+%ifdef WIN64
+ POP r7
+%endif
shl t4d, t3b
shl t6d, t3b
mov [t0+cb.range], t4d
@@ -144,12 +150,11 @@
PROLOGUE 0,7
mov t3d, [t0+cb.queue]
mov t6d, [t0+cb.low]
- jmp cabac_putbyte
cabac_putbyte:
; alive: t0=cb t3=queue t6=low
%ifdef WIN64
- DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+ DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
mov t1d, -1
add t3d, 10
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/const-a.asm
@@ -38,6 +38,7 @@
const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
+const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 8 dw 16
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-32.asm
@@ -32,120 +32,13 @@
SECTION .text
-%ifndef HIGH_BIT_DEPTH
+cextern pd_32
+cextern pw_pixel_max
+cextern pw_2
+cextern pw_m2
cextern pw_32
cextern hsub_mul
-; in: m0..m7
-; out: 0,4,6 in mem, rest in regs
-%macro DCT8_1D 9
- SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07
- SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16
- SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25
- SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34
- SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2
- SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3
- SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4
- mova [%9+0x00], m%6
- mova [%9+0x40], m%5
- psraw m%6, m%7, 1 ; a3>>1
- paddw m%6, m%8 ; a2 + (a3>>1)
- psraw m%8, 1 ; a2>>1
- psubw m%8, m%7 ; (a2>>1) - a3
- mova [%9+0x60], m%8
- psraw m%5, m%3, 1
- paddw m%5, m%3 ; d25+(d25>>1)
- psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
- psubw m%7, m%5
- psraw m%5, m%2, 1
- paddw m%5, m%2 ; d16+(d16>>1)
- paddw m%8, m%1, m%4
- psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
- psraw m%5, m%1, 1
- paddw m%5, m%1 ; d07+(d07>>1)
- paddw m%5, m%2
- paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
- psraw m%1, m%4, 1
- paddw m%1, m%4 ; d34+(d34>>1)
- paddw m%1, m%2
- psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
- psraw m%4, m%1, 2
- paddw m%4, m%5 ; a4 + (a7>>2)
- psraw m%3, m%8, 2
- paddw m%3, m%7 ; a5 + (a6>>2)
- psraw m%5, 2
- psraw m%7, 2
- psubw m%5, m%1 ; (a4>>2) - a7
- psubw m%8, m%7 ; a6 - (a5>>2)
- SWAP %2, %4, %3, %6, %8, %5
-%endmacro
-
-; in: 0,4 in mem, rest in regs
-; out: m0..m7
-%macro IDCT8_1D 9
- psraw m%1, m%3, 1
- psraw m%5, m%7, 1
- psubw m%1, m%7
- paddw m%5, m%3
- psraw m%7, m%2, 1
- paddw m%7, m%2
- paddw m%7, m%4
- paddw m%7, m%6
- psraw m%3, m%6, 1
- paddw m%3, m%6
- paddw m%3, m%8
- psubw m%3, m%2
- psubw m%2, m%4
- psubw m%6, m%4
- paddw m%2, m%8
- psubw m%6, m%8
- psraw m%4, 1
- psraw m%8, 1
- psubw m%2, m%4
- psubw m%6, m%8
- psraw m%4, m%7, 2
- psraw m%8, m%3, 2
- paddw m%4, m%6
- paddw m%8, m%2
- psraw m%6, 2
- psraw m%2, 2
- psubw m%7, m%6
- psubw m%2, m%3
- mova m%3, [%9+0x00]
- mova m%6, [%9+0x40]
- SUMSUB_BA w, %6, %3
- SUMSUB_BA w, %5, %6
- SUMSUB_BA w, %1, %3
- SUMSUB_BA w, %7, %5
- SUMSUB_BA w, %2, %1
- SUMSUB_BA w, %8, %3
- SUMSUB_BA w, %4, %6
- SWAP %1, %3
- SWAP %5, %7
- SWAP %1, %5, %6
- SWAP %3, %8, %7
-%endmacro
-
-INIT_MMX
-ALIGN 16
-load_diff_4x8_mmx:
- LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
- movq [r0], m0
- LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
- movq m0, [r0]
- ret
-
-cglobal dct8_mmx
- DCT8_1D 0,1,2,3,4,5,6,7,r0
- SAVE_MM_PERMUTATION
- ret
-
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
%xdefine %%base %1
%rep %0/2
@@ -174,6 +67,227 @@
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
+; in: size, m0..m7
+; out: 0,4,6 in memory at %10,%11,%12, rest in regs
+%macro DCT8_1D 12
+ SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
+ SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16
+ SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25
+ SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34
+ SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
+ SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
+ SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
+ mova %10, m%7
+ mova %11, m%6
+ psra%1 m%7, m%8, 1 ; a3>>1
+ padd%1 m%7, m%9 ; a2 + (a3>>1)
+ psra%1 m%9, 1 ; a2>>1
+ psub%1 m%9, m%8 ; (a2>>1) - a3
+ mova %12, m%9
+ psra%1 m%6, m%4, 1
+ padd%1 m%6, m%4 ; d25+(d25>>1)
+ psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
+ psub%1 m%8, m%6
+ psra%1 m%6, m%3, 1
+ padd%1 m%6, m%3 ; d16+(d16>>1)
+ padd%1 m%9, m%2, m%5
+ psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1))
+ psra%1 m%6, m%2, 1
+ padd%1 m%6, m%2 ; d07+(d07>>1)
+ padd%1 m%6, m%3
+ padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1))
+ psra%1 m%2, m%5, 1
+ padd%1 m%2, m%5 ; d34+(d34>>1)
+ padd%1 m%2, m%3
+ psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1))
+ psra%1 m%5, m%2, 2
+ padd%1 m%5, m%6 ; a4 + (a7>>2)
+ psra%1 m%4, m%9, 2
+ padd%1 m%4, m%8 ; a5 + (a6>>2)
+ psra%1 m%6, 2
+ psra%1 m%8, 2
+ psub%1 m%6, m%2 ; (a4>>2) - a7
+ psub%1 m%9, m%8 ; a6 - (a5>>2)
+ SWAP %3, %5, %4, %7, %9, %6
+%endmacro
+
+; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
+; out: m0..m7
+%macro IDCT8_1D 11
+ psra%1 m%2, m%4, 1
+ psra%1 m%6, m%8, 1
+ psub%1 m%2, m%8
+ padd%1 m%6, m%4
+ psra%1 m%8, m%3, 1
+ padd%1 m%8, m%3
+ padd%1 m%8, m%5
+ padd%1 m%8, m%7
+ psra%1 m%4, m%7, 1
+ padd%1 m%4, m%7
+ padd%1 m%4, m%9
+ psub%1 m%4, m%3
+ psub%1 m%3, m%5
+ psub%1 m%7, m%5
+ padd%1 m%3, m%9
+ psub%1 m%7, m%9
+ psra%1 m%5, 1
+ psra%1 m%9, 1
+ psub%1 m%3, m%5
+ psub%1 m%7, m%9
+ psra%1 m%5, m%8, 2
+ psra%1 m%9, m%4, 2
+ padd%1 m%5, m%7
+ padd%1 m%9, m%3
+ psra%1 m%7, 2
+ psra%1 m%3, 2
+ psub%1 m%8, m%7
+ psub%1 m%3, m%4
+ mova m%4, %10
+ mova m%7, %11
+ SUMSUB_BA %1, %7, %4
+ SUMSUB_BA %1, %6, %7
+ SUMSUB_BA %1, %2, %4
+ SUMSUB_BA %1, %8, %6
+ SUMSUB_BA %1, %3, %2
+ SUMSUB_BA %1, %9, %4
+ SUMSUB_BA %1, %5, %7
+ SWAP %2, %4
+ SWAP %6, %8
+ SWAP %2, %6, %7
+ SWAP %4, %9, %8
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+
+%macro SUB8x8_DCT8 0
+cglobal sub8x8_dct8, 3,3,8
+global current_function %+ .skip_prologue
+.skip_prologue:
+ LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
+ LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
+
+ DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
+ mova m0, [r0]
+
+ mova [r0+0x30], m5
+ mova [r0+0x70], m7
+ TRANSPOSE4x4W 0,1,2,3,4
+ WIDEN_SXWD 0,4
+ WIDEN_SXWD 1,5
+ WIDEN_SXWD 2,6
+ WIDEN_SXWD 3,7
+ DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
+ mova [r0+0x20], m4
+ mova [r0+0x40], m1
+ mova [r0+0x60], m5
+ mova [r0+0xA0], m6
+ mova [r0+0xE0], m7
+ mova m4, [r0+0x10]
+ mova m5, [r0+0x30]
+ mova m6, [r0+0x50]
+ mova m7, [r0+0x70]
+
+ TRANSPOSE4x4W 4,5,6,7,0
+ WIDEN_SXWD 4,0
+ WIDEN_SXWD 5,1
+ WIDEN_SXWD 6,2
+ WIDEN_SXWD 7,3
+ DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
+ mova [r0+0x30], m0
+ mova [r0+0x50], m5
+ mova [r0+0x70], m1
+ mova [r0+0xB0], m2
+ mova [r0+0xF0], m3
+ ret
+%endmacro ; SUB8x8_DCT8
+
+INIT_XMM sse2
+SUB8x8_DCT8
+INIT_XMM sse4
+SUB8x8_DCT8
+INIT_XMM avx
+SUB8x8_DCT8
+
+%macro ADD8x8_IDCT8 0
+cglobal add8x8_idct8, 2,2
+ add r1, 128
+global current_function %+ .skip_prologue
+.skip_prologue:
+ UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
+ IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0]
+ mova [r1+0], m4
+ TRANSPOSE4x4D 0,1,2,3,4
+ paddd m0, [pd_32]
+ mova m4, [r1+0]
+ SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2
+ TRANSPOSE4x4D 4,5,6,7,3
+ paddd m4, [pd_32]
+ SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6
+ UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
+ IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16]
+ mova [r1+16], m4
+ TRANSPOSE4x4D 0,1,2,3,4
+ mova m4, [r1+16]
+ mova [r1-112], m0
+ TRANSPOSE4x4D 4,5,6,7,0
+ SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7
+ UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
+ IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112]
+ SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
+ UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
+ IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16]
+ SPILL_SHUFFLE r1, 7,6,5, 7,6,5
+ mova m7, [pw_pixel_max]
+ pxor m6, m6
+ mova m5, [r1-128]
+ STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
+ mova m0, [r1-112]
+ STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
+ mova m0, [r1-96]
+ STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
+ mova m0, [r1-80]
+ STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
+ mova m0, [r1-64]
+ STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
+ mova m0, [r1-48]
+ mova m1, [r1+80]
+ STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
+ mova m0, [r1-32]
+ mova m1, [r1+96]
+ STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
+ mova m0, [r1-16]
+ mova m1, [r1+112]
+ STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
+ RET
+%endmacro ; ADD8x8_IDCT8
+
+INIT_XMM sse2
+ADD8x8_IDCT8
+INIT_XMM avx
+ADD8x8_IDCT8
+
+%else ; !HIGH_BIT_DEPTH
+
+INIT_MMX
+ALIGN 16
+load_diff_4x8_mmx:
+ LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+ movq [r0], m0
+ LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ movq m0, [r0]
+ ret
+
+cglobal dct8_mmx
+ DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
+ SAVE_MM_PERMUTATION
+ ret
+
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
@@ -223,7 +337,7 @@
ret
cglobal idct8_mmx
- IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SAVE_MM_PERMUTATION
ret
@@ -383,11 +497,11 @@
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%endif
- DCT8_1D 0,1,2,3,4,5,6,7,r0
+ DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
- DCT8_1D 0,1,2,3,4,5,6,7,r0
+ DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
SPILL r0, 1,2,3,5,7
ret
%endmacro
@@ -402,6 +516,8 @@
DCT_SUB8
INIT_XMM avx
DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
@@ -456,12 +572,12 @@
global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
- IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
paddw m0, [pw_32]
SPILL r1, 0
- IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-64.asm
@@ -31,109 +31,231 @@
SECTION .text
-%ifndef HIGH_BIT_DEPTH
+cextern pd_32
+cextern pw_pixel_max
+cextern pw_2
+cextern pw_m2
cextern pw_32
cextern hsub_mul
-%macro DCT8_1D 10
- SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
- SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
- SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
- SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07
-
- SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
- SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2
-
- psraw m%9, m%1, 1
- paddw m%9, m%1
- paddw m%9, m%2
- paddw m%9, m%3 ; %9=a4
-
- psraw m%10, m%4, 1
- paddw m%10, m%4
- paddw m%10, m%2
- psubw m%10, m%3 ; %10=a7
-
- SUMSUB_BA w, %4, %1
- psubw m%1, m%3
- psubw m%4, m%2
- psraw m%3, 1
- psraw m%2, 1
- psubw m%1, m%3 ; %1=a5
- psubw m%4, m%2 ; %4=a6
-
- psraw m%2, m%10, 2
- paddw m%2, m%9 ; %2=b1
- psraw m%9, 2
- psubw m%9, m%10 ; %9=b7
-
- SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
-
- psraw m%3, m%7, 1
- paddw m%3, m%8 ; %3=b2
- psraw m%8, 1
- psubw m%8, m%7 ; %8=b6
-
- psraw m%7, m%4, 2
- paddw m%7, m%1 ; %7=b3
- psraw m%1, 2
- psubw m%4, m%1 ; %4=b5
+; in: size, m0..m7, temp, temp
+; out: m0..m7
+%macro DCT8_1D 11
+ SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
+ SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
+ SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
+ SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
+
+ SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
+ SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
+
+ psra%1 m%10, m%2, 1
+ padd%1 m%10, m%2
+ padd%1 m%10, m%3
+ padd%1 m%10, m%4 ; %10=a4
+
+ psra%1 m%11, m%5, 1
+ padd%1 m%11, m%5
+ padd%1 m%11, m%3
+ psub%1 m%11, m%4 ; %11=a7
+
+ SUMSUB_BA %1, %5, %2
+ psub%1 m%2, m%4
+ psub%1 m%5, m%3
+ psra%1 m%4, 1
+ psra%1 m%3, 1
+ psub%1 m%2, m%4 ; %2=a5
+ psub%1 m%5, m%3 ; %5=a6
+
+ psra%1 m%3, m%11, 2
+ padd%1 m%3, m%10 ; %3=b1
+ psra%1 m%10, 2
+ psub%1 m%10, m%11 ; %10=b7
+
+ SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
+
+ psra%1 m%4, m%8, 1
+ padd%1 m%4, m%9 ; %4=b2
+ psra%1 m%9, 1
+ psub%1 m%9, m%8 ; %9=b6
+
+ psra%1 m%8, m%5, 2
+ padd%1 m%8, m%2 ; %8=b3
+ psra%1 m%2, 2
+ psub%1 m%5, m%2 ; %5=b5
- SWAP %1, %6, %4, %7, %8, %9
+ SWAP %2, %7, %5, %8, %9, %10
%endmacro
-%macro IDCT8_1D 10
- SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
-
- psraw m%9, m%2, 1
- paddw m%9, m%2
- paddw m%9, m%4
- paddw m%9, m%6 ; %9=a7
-
- psraw m%10, m%3, 1
- psubw m%10, m%7 ; %10=a4
- psraw m%7, 1
- paddw m%7, m%3 ; %7=a6
-
- psraw m%3, m%6, 1
- paddw m%3, m%6
- paddw m%3, m%8
- psubw m%3, m%2 ; %3=a5
-
- psubw m%2, m%4
- psubw m%6, m%4
- paddw m%2, m%8
- psubw m%6, m%8
- psraw m%4, 1
- psraw m%8, 1
- psubw m%2, m%4 ; %2=a3
- psubw m%6, m%8 ; %6=a1
-
- psraw m%4, m%9, 2
- paddw m%4, m%6 ; %4=b1
- psraw m%6, 2
- psubw m%9, m%6 ; %9=b7
-
- SUMSUB_BA w, %7, %5, %6 ; %7=b0, %5=b6
- SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4
-
- psraw m%8, m%3, 2
- paddw m%8, m%2 ; %8=b3
- psraw m%2, 2
- psubw m%2, m%3 ; %2=b5
-
- SUMSUB_BA w, %9, %7, %6 ; %9=c0, %7=c7
- SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
- SUMSUB_BA w, %8, %1, %6 ; %8=c2, %1=c5
- SUMSUB_BA w, %4, %5, %6 ; %4=c3, %5=c4
-
- SWAP %10, %3
- SWAP %1, %9, %6
- SWAP %3, %8, %7
+%macro IDCT8_1D 11
+ SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
+
+ psra%1 m%10, m%3, 1
+ padd%1 m%10, m%3
+ padd%1 m%10, m%5
+ padd%1 m%10, m%7 ; %9=a7
+
+ psra%1 m%11, m%4, 1
+ psub%1 m%11, m%8 ; %10=a4
+ psra%1 m%8, 1
+ padd%1 m%8, m%4 ; %7=a6
+
+ psra%1 m%4, m%7, 1
+ padd%1 m%4, m%7
+ padd%1 m%4, m%9
+ psub%1 m%4, m%3 ; %3=a5
+
+ psub%1 m%3, m%5
+ psub%1 m%7, m%5
+ padd%1 m%3, m%9
+ psub%1 m%7, m%9
+ psra%1 m%5, 1
+ psra%1 m%9, 1
+ psub%1 m%3, m%5 ; %2=a3
+ psub%1 m%7, m%9 ; %6=a1
+
+ psra%1 m%5, m%10, 2
+ padd%1 m%5, m%7 ; %4=b1
+ psra%1 m%7, 2
+ psub%1 m%10, m%7 ; %9=b7
+
+ SUMSUB_BA %1, %8, %6, %7 ; %7=b0, %5=b6
+ SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4
+
+ psra%1 m%9, m%4, 2
+ padd%1 m%9, m%3 ; %8=b3
+ psra%1 m%3, 2
+ psub%1 m%3, m%4 ; %2=b5
+
+ SUMSUB_BA %1, %10, %8, %7 ; %9=c0, %7=c7
+ SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6
+ SUMSUB_BA %1, %9, %2, %7 ; %8=c2, %1=c5
+ SUMSUB_BA %1, %5, %6, %7 ; %4=c3, %5=c4
+
+ SWAP %11, %4
+ SWAP %2, %10, %7
+ SWAP %4, %9, %8
%endmacro
+%ifdef HIGH_BIT_DEPTH
+
+%macro SUB8x8_DCT8 0
+cglobal sub8x8_dct8, 3,3,14
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global current_function %+ .skip_prologue
+.skip_prologue:
+ LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
+ LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
+
+ DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
+
+ TRANSPOSE4x4W 0,1,2,3,8
+ WIDEN_SXWD 0,8
+ WIDEN_SXWD 1,9
+ WIDEN_SXWD 2,10
+ WIDEN_SXWD 3,11
+ DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
+ mova [r0+0x00], m0
+ mova [r0+0x20], m8
+ mova [r0+0x40], m1
+ mova [r0+0x60], m9
+ mova [r0+0x80], m2
+ mova [r0+0xA0], m10
+ mova [r0+0xC0], m3
+ mova [r0+0xE0], m11
+
+ TRANSPOSE4x4W 4,5,6,7,0
+ WIDEN_SXWD 4,0
+ WIDEN_SXWD 5,1
+ WIDEN_SXWD 6,2
+ WIDEN_SXWD 7,3
+ DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
+ mova [r0+0x10], m4
+ mova [r0+0x30], m0
+ mova [r0+0x50], m5
+ mova [r0+0x70], m1
+ mova [r0+0x90], m6
+ mova [r0+0xB0], m2
+ mova [r0+0xD0], m7
+ mova [r0+0xF0], m3
+ ret
+%endmacro ; SUB8x8_DCT8
+
+INIT_XMM sse2
+SUB8x8_DCT8
+INIT_XMM sse4
+SUB8x8_DCT8
+INIT_XMM avx
+SUB8x8_DCT8
+
+%macro ADD8x8_IDCT8 0
+cglobal add8x8_idct8, 2,2,16
+ add r1, 128
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global current_function %+ .skip_prologue
+.skip_prologue:
+ mova m0, [r1-128]
+ mova m1, [r1-96]
+ mova m2, [r1-64]
+ mova m3, [r1-32]
+ mova m4, [r1+ 0]
+ mova m5, [r1+32]
+ mova m6, [r1+64]
+ mova m7, [r1+96]
+ IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
+ TRANSPOSE4x4D 0,1,2,3,8
+ TRANSPOSE4x4D 4,5,6,7,8
+ paddd m0, [pd_32]
+ paddd m4, [pd_32]
+ mova [r1+64], m6
+ mova [r1+96], m7
+ mova m8, [r1-112]
+ mova m9, [r1-80]
+ mova m10, [r1-48]
+ mova m11, [r1-16]
+ mova m12, [r1+16]
+ mova m13, [r1+48]
+ mova m14, [r1+80]
+ mova m15, [r1+112]
+ IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
+ TRANSPOSE4x4D 8,9,10,11,6
+ TRANSPOSE4x4D 12,13,14,15,6
+ IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
+ mova [r1-112], m8
+ mova [r1-80], m9
+ mova m6, [r1+64]
+ mova m7, [r1+96]
+ IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
+ pxor m8, m8
+ mova m9, [pw_pixel_max]
+ STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
+ STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
+ STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
+ STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
+ mova m0, [r1-112]
+ mova m1, [r1-80]
+ STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
+ STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
+ STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
+ STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
+ ret
+%endmacro ; ADD8x8_IDCT8
+
+INIT_XMM sse2
+ADD8x8_IDCT8
+INIT_XMM avx
+ADD8x8_IDCT8
+
+%else ; !HIGH_BIT_DEPTH
+
%macro DCT_SUB8 0
-cglobal sub8x8_dct, 3,3,11
+cglobal sub8x8_dct, 3,3,10
add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
mova m7, [hsub_mul]
@@ -174,9 +296,9 @@
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
@@ -198,6 +320,8 @@
DCT_SUB8
INIT_XMM avx
DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
@@ -221,10 +345,10 @@
movdqa m5, [r1+0x50]
movdqa m6, [r1+0x60]
movdqa m7, [r1+0x70]
- IDCT8_1D 0,1,2,3,4,5,6,7,8,10
+ IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32] ; rounding for the >>6 at the end
- IDCT8_1D 0,1,2,3,4,5,6,7,8,10
+ IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct-a.asm

@@ -31,6 +31,7 @@
%include "x86util.asm"
SECTION_RODATA
+pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
@@ -52,6 +53,8 @@
cextern pw_1
cextern pd_1
cextern pd_32
+cextern pw_ppppmmmm
+cextern pw_pmpmpmpm
%macro WALSH4_1D 6
SUMSUB_BADC %1, %5, %4, %3, %2, %6
@@ -352,8 +355,8 @@
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11
+%macro SUB_NxN_DCT 7
+cglobal %1, 3,3,%7
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
@@ -363,9 +366,6 @@
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
-%ifdef WIN64
- sub rsp, 8
-%endif
call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
@@ -380,7 +380,6 @@
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
call %2.skip_prologue
- add rsp, 8
RET
%else
jmp %2.skip_prologue
@@ -392,18 +391,18 @@
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%ifdef HIGH_BIT_DEPTH
-cglobal %1, 2,2,6
+cglobal %1, 2,2,%7
+%if %3==256
+ add r1, 128
+%endif
%else
cglobal %1, 2,2,11
pxor m7, m7
%endif
-%if mmsize==16
+%if mmsize==16 && %3!=256
add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
-%ifdef WIN64
- sub rsp, 8
-%endif
call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
@@ -415,7 +414,6 @@
add r1, %3
%ifdef WIN64
call %2.skip_prologue
- add rsp, 8
RET
%else
jmp %2.skip_prologue
@@ -424,24 +422,34 @@
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
-ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
-ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
+ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
+ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
+ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
+cextern add8x8_idct8_sse2.skip_prologue
+cextern add8x8_idct8_avx.skip_prologue
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
+ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern sub8x8_dct8_sse4.skip_prologue
+cextern sub8x8_dct8_avx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
+SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
INIT_MMX
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
@@ -449,9 +457,11 @@
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
-SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
+cextern sub8x8_dct_xop.skip_prologue
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
@@ -466,9 +476,9 @@
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
@@ -727,11 +737,11 @@
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-%macro DCTDC_2ROW_MMX 3
+%macro DCTDC_2ROW_MMX 4
movq %1, [r1+FENC_STRIDE*(0+%3)]
movq m1, [r1+FENC_STRIDE*(1+%3)]
- movq m2, [r2+FDEC_STRIDE*(0+%3)]
- movq m3, [r2+FDEC_STRIDE*(1+%3)]
+ movq m2, [r2+FDEC_STRIDE*(0+%4)]
+ movq m3, [r2+FDEC_STRIDE*(1+%4)]
movq %2, %1
punpckldq %1, m1
punpckhdq %2, m1
@@ -747,30 +757,29 @@
psubw %2, m1
%endmacro
-%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
- pshufw mm1, %1, q2200 ; s1 s1 s0 s0
- pshufw mm0, %2, q2301 ; s3 __ s2 __
- paddw mm1, %2 ; s1 s13 s0 s02
- psubw mm1, mm0 ; d13 s13 d02 s02
- pshufw mm0, mm1, q1010 ; d02 s02 d02 s02
- psrlq mm1, 32 ; __ __ d13 s13
- paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
- psllq mm1, 32 ; d13 s13
- psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
+%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
+ PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
+ PSHUFLW m0, %2, q2301 ; s3 __ s2 __
+ paddw m1, %2 ; s1 s13 s0 s02
+ psubw m1, m0 ; d13 s13 d02 s02
+ PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
+ psrlq m1, 32 ; __ __ d13 s13
+ paddw m0, m1 ; d02 s02 d02+d13 s02+s13
+ psllq m1, 32 ; d13 s13
+ psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
%ifndef HIGH_BIT_DEPTH
INIT_MMX
cglobal sub8x8_dct_dc_mmx2, 3,3
- DCTDC_2ROW_MMX m0, m4, 0
- DCTDC_2ROW_MMX m5, m6, 2
+ DCTDC_2ROW_MMX m0, m4, 0, 0
+ DCTDC_2ROW_MMX m5, m6, 2, 2
paddw m0, m5
paddw m4, m6
punpckldq m0, m4
- add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
- DCTDC_2ROW_MMX m7, m4, 0
- DCTDC_2ROW_MMX m5, m6, 2
+ DCTDC_2ROW_MMX m7, m4, 4, 0
+ DCTDC_2ROW_MMX m5, m6, 6, 2
paddw m7, m5
paddw m4, m6
punpckldq m7, m4
@@ -779,43 +788,151 @@
ret
INIT_XMM
-%macro DCTDC_2ROW_SSE2 3
- movq m0, [r1+FENC_STRIDE*(0+%1)]
- movq m1, [r1+FENC_STRIDE*(1+%1)]
- movq m2, [r2+FDEC_STRIDE*(0+%1)]
- movq m3, [r2+FDEC_STRIDE*(1+%1)]
- punpckldq m0, m1
- punpckldq m2, m3
- psadbw m0, m7
- psadbw m2, m7
-%if %2
- paddw %3, m0
- paddw m6, m2
+%macro DCTDC_2ROW_SSE2 4
+ movq m1, [r1+FENC_STRIDE*(0+%1)]
+ movq m2, [r1+FENC_STRIDE*(1+%1)]
+ punpckldq m1, m2
+ movq m2, [r2+FDEC_STRIDE*(0+%2)]
+ punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
+ psadbw m1, m0
+ psadbw m2, m0
+%if %3
+ paddd %4, m1
+ psubd %4, m2
%else
- SWAP %3, m0
- SWAP m6, m2
+ psubd m1, m2
+ SWAP %4, m1
%endif
%endmacro
-cglobal sub8x8_dct_dc_sse2, 3,3,8
- pxor m7, m7
- DCTDC_2ROW_SSE2 0, 0, m4
- DCTDC_2ROW_SSE2 2, 1, m4
- add r1, FENC_STRIDE*4
+cglobal sub8x8_dct_dc_sse2, 3,3
+ pxor m0, m0
+ DCTDC_2ROW_SSE2 0, 0, 0, m3
+ DCTDC_2ROW_SSE2 2, 2, 1, m3
add r2, FDEC_STRIDE*4
- psubd m4, m6
- DCTDC_2ROW_SSE2 0, 0, m5
- DCTDC_2ROW_SSE2 2, 1, m5
- psubd m5, m6
- packssdw m4, m5
- movhlps m5, m4
- movdq2q mm0, m4
- movdq2q mm7, m5
- DCT2x2 mm0, mm7
- movq [r0], mm0
+ DCTDC_2ROW_SSE2 4, 0, 0, m4
+ DCTDC_2ROW_SSE2 6, 2, 1, m4
+ packssdw m3, m3
+ packssdw m4, m4
+ DCT2x2 m3, m4
+ movq [r0], m0
+ RET
+
+%macro SUB8x16_DCT_DC 0
+cglobal sub8x16_dct_dc, 3,3
+ pxor m0, m0
+ DCTDC_2ROW_SSE2 0, 0, 0, m3
+ DCTDC_2ROW_SSE2 2, 2, 1, m3
+ add r1, FENC_STRIDE*8
+ add r2, FDEC_STRIDE*8
+ DCTDC_2ROW_SSE2 -4, -4, 0, m4
+ DCTDC_2ROW_SSE2 -2, -2, 1, m4
+ shufps m3, m4, q2020
+ DCTDC_2ROW_SSE2 0, 0, 0, m5
+ DCTDC_2ROW_SSE2 2, 2, 1, m5
+ add r2, FDEC_STRIDE*4
+ DCTDC_2ROW_SSE2 4, 0, 0, m4
+ DCTDC_2ROW_SSE2 6, 2, 1, m4
+ shufps m5, m4, q2020
+%if cpuflag(ssse3)
+ %define %%sign psignw
+%else
+ %define %%sign pmullw
+%endif
+ SUMSUB_BA d, 5, 3, 0
+ packssdw m5, m3
+ pshuflw m0, m5, q2301
+ pshufhw m0, m0, q2301
+ %%sign m5, [pw_pmpmpmpm]
+ paddw m0, m5
+ pshufd m1, m0, q1320
+ pshufd m0, m0, q0231
+ %%sign m1, [pw_ppppmmmm]
+ paddw m0, m1
+ mova [r0], m0
RET
+%endmacro ; SUB8x16_DCT_DC
+
+INIT_XMM sse2
+SUB8x16_DCT_DC
+INIT_XMM ssse3
+SUB8x16_DCT_DC
+
%endif ; !HIGH_BIT_DEPTH
+%macro DCTDC_4ROW_SSE2 2
+ mova %1, [r1+FENC_STRIDEB*%2]
+ mova m0, [r2+FDEC_STRIDEB*%2]
+%assign Y (%2+1)
+%rep 3
+ paddw %1, [r1+FENC_STRIDEB*Y]
+ paddw m0, [r2+FDEC_STRIDEB*Y]
+%assign Y (Y+1)
+%endrep
+ psubw %1, m0
+ pshufd m0, %1, q2301
+ paddw %1, m0
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+%macro SUB8x8_DCT_DC_10 0
+cglobal sub8x8_dct_dc, 3,3,3
+ DCTDC_4ROW_SSE2 m1, 0
+ DCTDC_4ROW_SSE2 m2, 4
+ mova m0, [pw_ppmmmmpp]
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pshufd m0, m1, q2200 ; -1 -1 +0 +0
+ pshufd m1, m1, q0033 ; +0 +0 +1 +1
+ paddd m1, m0
+ pshufd m0, m2, q1023 ; -2 +2 -3 +3
+ paddd m1, m2
+ paddd m1, m0
+ mova [r0], m1
+ RET
+%endmacro
+INIT_XMM sse2
+SUB8x8_DCT_DC_10
+
+%macro SUB8x16_DCT_DC_10 0
+cglobal sub8x16_dct_dc, 3,3,6
+ DCTDC_4ROW_SSE2 m1, 0
+ DCTDC_4ROW_SSE2 m2, 4
+ DCTDC_4ROW_SSE2 m3, 8
+ DCTDC_4ROW_SSE2 m4, 12
+ mova m0, [pw_ppmmmmpp]
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pshufd m5, m1, q2200 ; -1 -1 +0 +0
+ pshufd m1, m1, q0033 ; +0 +0 +1 +1
+ paddd m1, m5
+ pshufd m5, m2, q1023 ; -2 +2 -3 +3
+ paddd m1, m2
+ paddd m1, m5 ; a6 a2 a4 a0
+ pmaddwd m3, m0
+ pmaddwd m4, m0
+ pshufd m5, m3, q2200
+ pshufd m3, m3, q0033
+ paddd m3, m5
+ pshufd m5, m4, q1023
+ paddd m3, m4
+ paddd m3, m5 ; a7 a3 a5 a1
+ paddd m0, m1, m3
+ psubd m1, m3
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m2, m0, m1
+ punpckhqdq m1, m0
+ mova [r0+ 0], m2
+ mova [r0+16], m1
+ RET
+%endmacro
+INIT_XMM sse2
+SUB8x16_DCT_DC_10
+INIT_XMM avx
+SUB8x16_DCT_DC_10
+%endif
+
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
@@ -1327,15 +1444,9 @@
mova [r0+(%1+64)*SIZEOF_PIXEL], m2
mova [r0+(%1+96)*SIZEOF_PIXEL], m3
packsswb m0, m1
-%if %1
- por m6, m2
- por m7, m3
- por m5, m0
-%else
- SWAP 5, 0
- SWAP 6, 2
- SWAP 7, 3
-%endif
+ ACCUM por, 6, 2, %1
+ ACCUM por, 7, 3, %1
+ ACCUM por, 5, 0, %1
%endmacro
%macro ZIGZAG_8x8_CAVLC 1

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/dct.h

@@ -38,8 +38,13 @@
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
@@ -69,20 +74,22 @@
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_avx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_avx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 );
+void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
+void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
-void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
-void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
-void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] );
-void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );
+void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] );
+void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
+void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
+void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/deblock-a.asm

@@ -1138,28 +1138,28 @@
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 5,7
- movsxd r10, r1d
- lea r11, [r10+r10*2]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
+cglobal deblock_h_luma, 5,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
%ifdef WIN64
- sub rsp, 0x98
+ sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
- sub rsp, 0x68
+ sub rsp, 0x68
%define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
@@ -1174,17 +1174,17 @@
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
%ifdef WIN64
add rsp, 0x98
@@ -1516,33 +1516,33 @@
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7
- movsxd r10, r1d
- lea r11, [r10*3]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
- sub rsp, 0x88
+cglobal deblock_h_luma_intra, 4,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+ sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call deblock_v_luma_intra
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r5, [r6+r11]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- add rsp, 0x88
+ lea r5, [r6+r8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra, 2,4
@@ -1675,7 +1675,6 @@
%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
- RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
@@ -1726,7 +1725,6 @@
cglobal deblock_intra_body
- RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
@@ -1770,7 +1768,107 @@
dec r4
jg .loop
REP_RET
-%endmacro
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+ add r1, r1
+%if mmsize == 8
+ mov r4, 16/mmsize
+.loop:
+%else
+ lea r5, [r1*3]
+%endif
+ CHROMA_H_LOAD r5
+ LOAD_AB m4, m5, r2, r3
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5
+%if mmsize == 8
+ lea r0, [r0+r1*(mmsize/4)]
+ dec r4
+ jg .loop
+%endif
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_mbaff, 5,7,8
+ add r1, r1
+ lea r6, [r1*3]
+%if mmsize == 8
+ mov r5, 16/mmsize
+.loop:
+%endif
+ CHROMA_H_LOAD r6
+ LOAD_AB m4, m5, r2, r3
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ movd m6, [r4]
+ punpcklbw m6, m6
+ psraw m6, 8
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r6
+%if mmsize == 8
+ lea r0, [r0+r1*(mmsize/4)]
+ add r4, mmsize/4
+ dec r5
+ jg .loop
+%endif
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_422_intra, 4,6,8
+ add r1, r1
+ mov r4, 64/mmsize
+%if mmsize == 16
+ lea r5, [r1*3]
+%endif
+.loop:
+ CHROMA_H_LOAD r5
+ call deblock_intra_body
+ CHROMA_H_STORE r5
+ lea r0, [r0+r1*(mmsize/4)]
+ dec r4
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_422, 5,7,8
+ add r1, r1
+ mov r5, 64/mmsize
+ lea r6, [r1*3]
+.loop:
+ CHROMA_H_LOAD r6
+ LOAD_AB m4, m5, r2m, r3
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ movd m6, [r4-1]
+ psraw m6, 8
+ SPLATW m6, m6
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r6
+ lea r0, [r0+r1*(mmsize/4)]
+%if mmsize == 16
+ inc r4
+%else
+ mov r2, r5
+ and r2, 1
+ add r4, r2 ; increment once every 2 iterations
+%endif
+ dec r5
+ jg .loop
+ REP_RET
+%endmacro ; DEBLOCK_CHROMA
%ifndef ARCH_X86_64
INIT_MMX mmx2
@@ -1791,7 +1889,7 @@
sub t5, r1
%if mmsize==8
mov dword r0m, 2
-.skip_prologue:
+.loop:
%endif
%endmacro
@@ -1802,10 +1900,6 @@
lea t6, [r1*3]
mov t5, r0
add r0, t6
-%if mmsize==8
- mov dword r0m, 2
-.skip_prologue:
-%endif
%endmacro
%macro CHROMA_V_LOOP 1
@@ -1816,7 +1910,7 @@
add r4, 2
%endif
dec dword r0m
- jg .skip_prologue
+ jg .loop
%endif
%endmacro
@@ -1828,7 +1922,7 @@
add r4, 2
%endif
dec dword r0m
- jg .skip_prologue
+ jg .loop
%endif
%endmacro
@@ -1865,6 +1959,10 @@
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
+%if mmsize==8
+ mov dword r0m, 2
+.loop:
+%endif
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_inter_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
@@ -1881,21 +1979,44 @@
DEBLOCK_CHROMA
%endif
-%macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,7,8
-%ifdef ARCH_X86_64
- %define cntr r11
-%else
- %define cntr dword r0m
-%endif
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_420_MBAFF 0
+cglobal deblock_h_chroma_mbaff, 5,7,8
dec r2d
dec r3d
sub r0, 4
lea t6, [r1*3]
mov t5, r0
add r0, t6
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ RET
+%endmacro
+
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_420_MBAFF
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_420_MBAFF
+%endif
+
+%macro DEBLOCK_H_CHROMA_422 0
+cglobal deblock_h_chroma_422, 5,8,8
+%ifdef ARCH_X86_64
+ %define cntr r7
+%else
+ %define cntr dword r0m
+%endif
+ CHROMA_H_START
mov cntr, 32/mmsize
-.skip_prologue:
+.loop:
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
@@ -1913,7 +2034,7 @@
lea t5, [t5+r1*(mmsize/2)]
add r4, mmsize/8
dec cntr
- jg .skip_prologue
+ jg .loop
REP_RET
%endmacro
@@ -1937,7 +2058,7 @@
%define t5 r4
%define t6 r5
-%macro DEBLOCK_CHROMA_INTRA 0
+%macro DEBLOCK_CHROMA_INTRA_BODY 0
cglobal chroma_intra_body
LOAD_MASK r2d, r3d
mova m5, m1
@@ -1951,7 +2072,9 @@
paddb m1, m5
paddb m2, m6
ret
+%endmacro
+%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
@@ -1972,21 +2095,52 @@
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
+%if mmsize==8
+ mov dword r0m, 2
+.loop:
+%endif
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 0
RET
+
+cglobal deblock_h_chroma_422_intra, 4,7,8
+ CHROMA_H_START
+ mov r6d, 32/mmsize
+.loop:
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ call chroma_intra_body
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ lea r0, [r0+r1*(mmsize/2)]
+ lea t5, [t5+r1*(mmsize/2)]
+ dec r6d
+ jg .loop
+ REP_RET
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
+DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
+DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
-%ifndef ARCH_X86_64
INIT_MMX mmx2
+DEBLOCK_CHROMA_INTRA_BODY
+%ifndef ARCH_X86_64
DEBLOCK_CHROMA_INTRA
%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+ CHROMA_H_START
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ call chroma_intra_body
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ RET
%endif ; !HIGH_BIT_DEPTH

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/mc-a.asm

@@ -58,13 +58,16 @@
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,10,11
- %macro AVG_START 0-1 0
- PROLOGUE 6,7,%1
%ifdef WIN64
- movsxd r5, r5d
-%endif
+ DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+ %macro AVG_START 0-1 0
+ PROLOGUE 5,7,%1
+ movsxd r5, dword r5m
+ %endmacro
+%elifdef UNIX64
+ DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,9,%1
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
@@ -1157,7 +1160,9 @@
jg avg_w16_align%1_%2_ssse3
ret
%if %1==0
- times 13 db 0x90 ; make sure the first ones don't end up short
+ ; make sure the first ones don't end up short
+ ALIGN 16
+ times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
%endif
%endmacro
@@ -1171,7 +1176,7 @@
and eax, 7
jz x264_pixel_avg2_w16_sse2
%endif
- PROLOGUE 6, 7
+ PROLOGUE 6, 8
lea r6, [r4+r2]
and r4, ~0xf
and r6, 0x1f
@@ -1181,8 +1186,8 @@
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
- lea r11, [avg_w16_addr]
- add r6, r11
+ lea r7, [avg_w16_addr]
+ add r6, r7
%else
lea r6, [avg_w16_addr + r6]
%endif
@@ -1393,17 +1398,22 @@
;=============================================================================
%ifdef ARCH_X86_64
- DECLARE_REG_TMP 10,11,6
+ DECLARE_REG_TMP 6,7,8
%else
DECLARE_REG_TMP 0,1,2
%endif
-%macro MC_CHROMA_START 0
+%macro MC_CHROMA_START 1
+%ifdef ARCH_X86_64
+ PROLOGUE 0,9,%1
+%else
+ PROLOGUE 0,6,%1
+%endif
movifnidn r3, r3mp
movifnidn r4d, r4m
movifnidn r5d, r5m
- movifnidn t2d, r6m
- mov t0d, t2d
+ movifnidn t0d, r6m
+ mov t2d, t0d
mov t1d, r5d
sar t0d, 3
sar t1d, 3
@@ -1447,8 +1457,8 @@
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 0
-cglobal mc_chroma, 0,6
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 0
FIX_STRIDES r4
and r5d, 7
%ifdef ARCH_X86_64
@@ -1726,8 +1736,8 @@
movifnidn r5d, r8m
cmp dword r7m, 4
jg .mc1d_w8
- mov r10, r2
- mov r11, r4
+ mov r7, r2
+ mov r8, r4
%if mmsize!=8
shr r5d, 1
%endif
@@ -1741,7 +1751,7 @@
%else
movu m0, [r3]
movu m1, [r3+r6]
- add r3, r11
+ add r3, r8
movu m2, [r3]
movu m3, [r3+r6]
%endif
@@ -1757,7 +1767,7 @@
movq m0, [r3]
movq m1, [r3+r6]
%if mmsize!=8
- add r3, r11
+ add r3, r8
movhps m0, [r3]
movhps m1, [r3+r6]
%endif
@@ -1778,22 +1788,22 @@
psrlw m2, 3
%ifdef HIGH_BIT_DEPTH
%if mmsize == 8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
%endif
movq [r0], m0
movq [r1], m2
%if mmsize == 16
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
movhps [r0], m0
movhps [r1], m2
%endif
%else ; !HIGH_BIT_DEPTH
packuswb m0, m2
%if mmsize==8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
movd [r0], m0
psrlq m0, 32
movd [r1], m0
@@ -1801,8 +1811,8 @@
movhlps m1, m0
movd [r0], m0
movd [r1], m1
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
psrldq m0, 4
psrldq m1, 4
movd [r0], m0
@@ -1818,8 +1828,8 @@
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
- mov r10, 4*SIZEOF_PIXEL
- mov r11, 8*SIZEOF_PIXEL
+ mov r7, 4*SIZEOF_PIXEL
+ mov r8, 8*SIZEOF_PIXEL
%if mmsize==8
shl r5d, 1
%endif
@@ -1827,10 +1837,9 @@
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
-
%macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma, 0,6,9
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 9
and r5d, 7
and t2d, 7
mov t0d, r5d

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/mc-a2.asm

@@ -660,7 +660,7 @@
mova %1, m1
mova %2, m4
FILT_PACK m1, m4, 5, m15
- movntps [r11+r4+%5], m1
+ movntps [r8+r4+%5], m1
%endmacro
%macro FILT_C 4
@@ -728,26 +728,26 @@
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,7,16
+cglobal hpel_filter, 7,9,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
%endif
- mov r10, r3
+ mov r7, r3
sub r5, 16
- mov r11, r1
- and r10, 15
- sub r3, r10
+ mov r8, r1
+ and r7, 15
+ sub r3, r7
add r0, r5
- add r11, r5
- add r10, r5
+ add r8, r5
+ add r7, r5
add r5, r2
mov r2, r4
- neg r10
+ neg r7
lea r1, [r3+r2]
sub r3, r2
sub r3, r2
- mov r4, r10
+ mov r4, r7
mova m15, [pw_16]
%if cpuflag(ssse3)
mova m0, [filt_mul51]
@@ -774,14 +774,14 @@
cmp r4, 16
jl .lastx
; setup regs for next y
- sub r4, r10
+ sub r4, r7
sub r4, r2
sub r1, r4
sub r3, r4
add r0, r2
- add r11, r2
+ add r8, r2
add r5, r2
- mov r4, r10
+ mov r4, r7
sub r6d, 1
jg .loopy
sfence
@@ -950,7 +950,7 @@
; uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,7
+cglobal plane_copy_interleave_core, 7,9
FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
mov r1m, r1d
@@ -965,7 +965,7 @@
add r2, r6
add r4, r6
%ifdef ARCH_X86_64
- DECLARE_REG_TMP 10,11
+ DECLARE_REG_TMP 7,8
%else
DECLARE_REG_TMP 1,3
%endif

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/pixel-a.asm

@@ -130,7 +130,7 @@
cextern pw_1
cextern pw_8
cextern pw_16
-cextern pw_64
+cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
@@ -1267,15 +1267,21 @@
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+%ifdef WIN64
+ PUSH r7
+%endif
+ mov r6, r0
+ mov r7, r2
%endif
%endmacro
%macro RESTORE_AND_INC_POINTERS 0
%ifdef ARCH_X86_64
- lea r0, [r10+8]
- lea r2, [r11+8]
+ lea r0, [r6+8]
+ lea r2, [r7+8]
+%ifdef WIN64
+ POP r7
+%endif
%else
mov r0, r0mp
mov r2, r2mp
@@ -1473,10 +1479,10 @@
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
- lea r10, [r0+4*r1]
- lea r11, [r2+4*r3]
+ lea r6, [r0+4*r1]
+ lea r7, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
- LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
+ LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
@@ -1488,7 +1494,7 @@
SAVE_MM_PERMUTATION
ret
-cglobal pixel_sa8d_8x8, 4,6,12
+cglobal pixel_sa8d_8x8, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
@@ -1506,7 +1512,7 @@
shr eax, 1
RET
-cglobal pixel_sa8d_16x16, 4,6,12
+cglobal pixel_sa8d_16x16, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
@@ -1794,6 +1800,12 @@
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
+%ifdef HIGH_BIT_DEPTH
+ mova m0, [r0+0*FENC_STRIDEB]
+ mova m1, [r0+1*FENC_STRIDEB]
+ mova m2, [r0+2*FENC_STRIDEB]
+ mova m3, [r0+3*FENC_STRIDEB]
+%else
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
@@ -1803,24 +1815,31 @@
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
+%endif
HADAMARD4_2D 0, 1, 2, 3, 4
SAVE_MM_PERMUTATION
ret
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
- movd %3, [r1+%2-FDEC_STRIDE]
+%ifdef HIGH_BIT_DEPTH
+ mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
+%else
+ movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
pxor %5, %5
punpcklbw %3, %5
+%endif
%else ; left
%ifnidn %2, 0
- shl %2d, 5 ; log(FDEC_STRIDE)
+ shl %2d, 5 ; log(FDEC_STRIDEB)
%endif
- movd %3, [r1+%2-4+1*FDEC_STRIDE]
- pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0
- pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2
- pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3
+ movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
+%ifndef HIGH_BIT_DEPTH
psrlw %3, 8
+%endif
%ifnidn %2, 0
shr %2d, 5
%endif
@@ -1859,19 +1878,6 @@
%8 %3, %6
%endmacro
-%macro CLEAR_SUMS 0
-%ifdef ARCH_X86_64
- mov qword [sums+0], 0
- mov qword [sums+8], 0
- mov qword [sums+16], 0
-%else
- pxor m7, m7
- movq [sums+0], m7
- movq [sums+8], m7
- movq [sums+16], m7
-%endif
-%endmacro
-
; in: m1..m3
; out: m7
; clobber: m4..m6
@@ -1942,45 +1948,47 @@
%endif
RET
-%ifdef ARCH_X86_64
- %define t0 r10
- %define t2 r11
-%else
- %define t0 r0
- %define t2 r2
-%endif
-
;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
- %assign stack_pad 88 + ((stack_offset+88+gprsize)&15)
+ %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
+%define sums rsp+64 ; size 56
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
movifnidn r1, r1mp
- CLEAR_SUMS
+
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%ifdef HIGH_BIT_DEPTH
+ mova [sums+24], m7
+ mova [sums+32], m7
+ mova [sums+40], m7
+ mova [sums+48], m7
+%endif
; 1D hadamards
- mov t0d, 12
- movd m6, [pw_64]
+ mov r3d, 12
+ movd m6, [pw_32]
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m1, m2, m3
- paddw m6, m0
- paddw m6, m1
- sub t0d, 4
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m1, m2, m3
+ pavgw m0, m1
+ paddw m6, m0
+ sub r3d, 4
jge .loop_edge
- psrlw m6, 3
- pand m6, [sw_f0] ; dc
+ psrlw m6, 2
+ pand m6, [sw_f0] ; dc
; 2D hadamards
- movifnidn r0, r0mp
- mov r3, -4
+ movifnidn r0, r0mp
+ mov r3, -4
.loop_y:
- mov r4, -4
+ mov r4, -4
.loop_x:
call hadamard_load
@@ -1988,38 +1996,74 @@
SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
pavgw m4, m7
pavgw m5, m7
- paddw m0, [sums+0] ; i16x16_v satd
- paddw m4, [sums+8] ; i16x16_h satd
+ paddw m0, [sums+ 0] ; i16x16_v satd
+ paddw m4, [sums+ 8] ; i16x16_h satd
paddw m5, [sums+16] ; i16x16_dc satd
- movq [sums+0], m0
- movq [sums+8], m4
- movq [sums+16], m5
+ mova [sums+ 0], m0
+ mova [sums+ 8], m4
+ mova [sums+16], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-16
+%ifdef HIGH_BIT_DEPTH
+ mova m7, [pw_1]
+ pmaddwd m4, m7
+ pmaddwd m0, m7
+ paddd m4, [sums+32]
+ paddd m0, [sums+24]
+ mova [sums+32], m4
+ mova [sums+24], m0
+ pxor m7, m7
+ punpckhwd m3, m5, m7
+ punpcklwd m5, m7
+ paddd m3, [sums+48]
+ paddd m5, [sums+40]
+ mova [sums+48], m3
+ mova [sums+40], m5
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%endif
+ add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
inc r3
jl .loop_y
; horizontal sum
movifnidn r2, r2mp
- movq m2, [sums+16]
- movq m1, [sums+8]
- movq m0, [sums+0]
- movq m7, m2
- SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+%ifdef HIGH_BIT_DEPTH
+ mova m1, m5
+ paddd m5, m3
+ HADDD m5, m7 ; DC satd
+ HADDD m4, m7 ; H satd
+ HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
+ psrld m0, 1
+ psrlq m1, 32 ; DC[1]
+ paddd m0, m3 ; DC[2]
+ psrlq m3, 32 ; DC[3]
+ paddd m0, m1
+ paddd m0, m3
+%else
+ mova m7, m5
+ SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
psrld m0, 1
pslld m7, 16
psrld m7, 16
- paddd m0, m2
+ paddd m0, m5
psubd m0, m7
- movd [r2+8], m2 ; i16x16_dc satd
- movd [r2+4], m1 ; i16x16_h satd
- movd [r2+0], m0 ; i16x16_v satd
- ADD rsp, stack_pad
+%endif
+ movd [r2+8], m5 ; i16x16_dc satd
+ movd [r2+4], m4 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
+ ADD rsp, stack_pad
RET
+%ifdef ARCH_X86_64
+ %define t0 r6
+%else
+ %define t0 r2
+%endif
+
;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
@@ -2031,32 +2075,35 @@
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
movifnidn r1, r1mp
- CLEAR_SUMS
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
; 1D hadamards
- mov t0d, 4
+ mov r3d, 4
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m0, m1, m2
- sub t0d, 4
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m0, m1, m2
+ sub r3d, 4
jge .loop_edge
; dc
- movzx t2d, word [left_1d+0]
+ movzx t0d, word [left_1d+0]
movzx r3d, word [top_1d+0]
movzx r4d, word [left_1d+8]
movzx r5d, word [top_1d+8]
- lea t2d, [t2 + r3 + 16]
+ lea t0d, [t0 + r3 + 16]
lea r3d, [r4 + r5 + 16]
- shr t2d, 1
+ shr t0d, 1
shr r3d, 1
add r4d, 8
add r5d, 8
- and t2d, -16 ; tl
+ and t0d, -16 ; tl
and r3d, -16 ; br
and r4d, -16 ; bl
and r5d, -16 ; tr
- mov [dc_1d+ 0], t2d ; tl
+ mov [dc_1d+ 0], t0d ; tl
mov [dc_1d+ 4], r5d ; tr
mov [dc_1d+ 8], r4d ; bl
mov [dc_1d+12], r3d ; br
@@ -2082,10 +2129,10 @@
movq [sums+8], m4
movq [sums+0], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-8
+ add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
add r5, 8
inc r3
jl .loop_y
@@ -2095,10 +2142,18 @@
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
+%ifdef HIGH_BIT_DEPTH
+ psrlq m7, 16
+ HADDW m7, m3
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ paddd m2, m7
+%else
psrlq m7, 15
paddw m2, m7
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
+%endif
movd [r2+0], m0 ; i8x8c_dc satd
movd [r2+4], m1 ; i8x8c_h satd
movd [r2+8], m2 ; i8x8c_v satd
@@ -3717,9 +3772,9 @@
SATDS_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
+%endif
INIT_MMX mmx2
INTRA_X3_MMX
-%endif
INIT_XMM sse2
HADAMARD_AC_SSE2
@@ -3808,13 +3863,8 @@
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
-%if %1==0
- SWAP 3, 5
- SWAP 4, 7
-%else
- paddd m3, m5
- paddd m4, m7
-%endif
+ ACCUM paddd, 3, 5, %1
+ ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/pixel.h

@@ -98,6 +98,9 @@
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_avx ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
@@ -112,6 +115,7 @@
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
@@ -153,6 +157,8 @@
int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
+int x264_pixel_vsad_ssse3( pixel *src, int stride, int height );
+int x264_pixel_vsad_xop( pixel *src, int stride, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/predict-a.asm

@@ -34,6 +34,7 @@
pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
+pw_m7: times 8 dw -7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
@@ -1079,36 +1080,42 @@
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
-INIT_MMX
-cglobal predict_8x8c_p_core_mmx2, 1,2
+%ifndef HIGH_BIT_DEPTH
+%macro PREDICT_CHROMA_P_MMX 1
+cglobal predict_8x%1c_p_core, 1,2
LOAD_PLANE_ARGS
- movq mm1, mm2
- pmullw mm2, [pw_3210]
- psllw mm1, 2
- paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
-
- mov r1d, 8
+ movq m1, m2
+ pmullw m2, [pw_3210]
+ psllw m1, 2
+ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
+ paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
+ mov r1d, %1
ALIGN 4
.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0], mm5
+ movq m5, m0
+ movq m6, m1
+ psraw m5, 5
+ psraw m6, 5
+ packuswb m5, m6
+ movq [r0], m5
- paddsw mm0, mm4
- paddsw mm1, mm4
+ paddsw m0, m4
+ paddsw m1, m4
add r0, FDEC_STRIDE
dec r1d
- jg .loop
+ jg .loop
REP_RET
+%endmacro ; PREDICT_CHROMA_P_MMX
+
+INIT_MMX mmx2
+PREDICT_CHROMA_P_MMX 8
+PREDICT_CHROMA_P_MMX 16
+%endif ; !HIGH_BIT_DEPTH
%endif ; !ARCH_X86_64
-%macro PREDICT_8x8C 0
+%macro PREDICT_CHROMA_P_XMM 1
%ifdef HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core, 1,1,7
+cglobal predict_8x%1c_p_core, 1,2,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
@@ -1118,9 +1125,13 @@
SPLATW m2, m2, 0
SPLATW m4, m4, 0
pmullw m2, [pw_43210123] ; b
- pmullw m5, m4, [pw_m3] ; c
+%if %1 == 16
+ pmullw m5, m4, [pw_m7] ; c
+%else
+ pmullw m5, m4, [pw_m3]
+%endif
paddw m5, [pw_16]
- mov r1d, 8
+ mov r1d, %1
.loop:
paddsw m6, m2, m5
paddsw m6, m0
@@ -1129,11 +1140,11 @@
mova [r0], m6
paddw m5, m4
add r0, FDEC_STRIDEB
- dec r1d
+ dec r1d
jg .loop
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core, 1,1
+cglobal predict_8x%1c_p_core, 1,2
movd m0, r1m
movd m2, r2m
movd m4, r3m
@@ -1144,8 +1155,7 @@
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
paddsw m3, m0, m4
paddsw m4, m4
-call .loop
- add r0, FDEC_STRIDE*4
+ mov r1d, %1/4
.loop:
paddsw m1, m3, m4
paddsw m5, m0, m4
@@ -1161,14 +1171,19 @@
packuswb m5, m1
movq [r0+FDEC_STRIDE*2], m5
movhps [r0+FDEC_STRIDE*3], m5
+ add r0, FDEC_STRIDE*4
+ dec r1d
+ jg .loop
RET
%endif ; HIGH_BIT_DEPTH
-%endmacro
+%endmacro ; PREDICT_CHROMA_P_XMM
INIT_XMM sse2
-PREDICT_8x8C
+PREDICT_CHROMA_P_XMM 8
+PREDICT_CHROMA_P_XMM 16
INIT_XMM avx
-PREDICT_8x8C
+PREDICT_CHROMA_P_XMM 8
+PREDICT_CHROMA_P_XMM 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
@@ -1407,6 +1422,51 @@
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
+; void predict_8x8_vl( pixel *src, pixel *edge )
+;-----------------------------------------------------------------------------
+%macro PREDICT_8x8_VL_10 1
+cglobal predict_8x8_vl, 2,2,8
+ mova m0, [r1+16*SIZEOF_PIXEL]
+ mova m1, [r1+24*SIZEOF_PIXEL]
+ PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4
+ PSRLPIX m4, m1, 1
+ pavg%1 m6, m0, m2
+ pavg%1 m7, m1, m4
+ add r0, FDEC_STRIDEB*4
+ mova [r0-4*FDEC_STRIDEB], m6
+ PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
+ mova [r0-2*FDEC_STRIDEB], m3
+ PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
+ mova [r0+0*FDEC_STRIDEB], m3
+ PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
+ mova [r0+2*FDEC_STRIDEB], m7
+ PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
+ PSLLPIX m5, m0, 1
+ PRED8x8_LOWPASS m0, m5, m2, m0, m7
+ PRED8x8_LOWPASS m1, m3, m4, m1, m7
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
+ mova [r0-3*FDEC_STRIDEB], m4
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
+ mova [r0-1*FDEC_STRIDEB], m4
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
+ mova [r0+1*FDEC_STRIDEB], m4
+ PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
+ mova [r0+3*FDEC_STRIDEB], m1
+ RET
+%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_8x8_VL_10 w
+INIT_XMM ssse3
+PREDICT_8x8_VL_10 w
+INIT_XMM avx
+PREDICT_8x8_VL_10 w
+%else
+INIT_MMX mmx2
+PREDICT_8x8_VL_10 b
+%endif
+
+;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 2
@@ -1618,7 +1678,6 @@
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
-INIT_XMM sse2
%macro PREDICT_C_H 1
cglobal predict_8x%1c_h, 1,1
add r0, FDEC_STRIDEB*4
@@ -1627,11 +1686,18 @@
movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
SPLATW m0, m0, 1
mova [r0+FDEC_STRIDEB*Y], m0
+%if mmsize == 8
+ mova [r0+FDEC_STRIDEB*Y+8], m0
+%endif
%assign Y Y+1
%endrep
RET
%endmacro
+INIT_MMX mmx2
+PREDICT_C_H 8
+PREDICT_C_H 16
+INIT_XMM sse2
PREDICT_C_H 8
PREDICT_C_H 16

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/predict-c.c

@@ -99,6 +99,41 @@
PREDICT_16x16_P( avx )
#endif //!HIGH_BIT_DEPTH
+#define PREDICT_8x16C_P_CORE \
+ int H = 0, V = 0;\
+ for( int i = 0; i < 4; i++ )\
+ H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\
+ for( int i = 0; i < 8; i++ )\
+ V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\
+ int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
+ int b = ( 17 * H + 16 ) >> 5;\
+ int c = ( 5 * V + 32 ) >> 6;
+
+#if HIGH_BIT_DEPTH
+#define PREDICT_8x16_P(name)\
+static void x264_predict_8x16c_p_##name( uint16_t *src )\
+{\
+ PREDICT_8x16C_P_CORE \
+ x264_predict_8x16c_p_core_##name( src, a, b, c );\
+}
+
+PREDICT_8x16_P(sse2)
+PREDICT_8x16_P(avx)
+#else
+#define PREDICT_8x16_P(name)\
+static void x264_predict_8x16c_p_##name( uint8_t *src )\
+{\
+ PREDICT_8x16C_P_CORE \
+ int i00 = a -3*b -7*c + 16;\
+ x264_predict_8x16c_p_core_##name( src, i00, b, c );\
+}
+#ifndef ARCH_X86_64
+PREDICT_8x16_P(mmx2)
+#endif
+PREDICT_8x16_P(sse2)
+PREDICT_8x16_P(avx)
+#endif
+
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
static void x264_predict_16x16_p_sse2( uint16_t *src )
@@ -338,6 +373,7 @@
if( !(cpu&X264_CPU_MMX2) )
return;
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2;
@@ -386,12 +422,17 @@
if( !(cpu&X264_CPU_MMX2) )
return;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
#else
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
if( !(cpu&X264_CPU_MMX2) )
@@ -399,9 +440,18 @@
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
+#ifndef ARCH_X86_64
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_mmx2;
+#endif
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
#endif // HIGH_BIT_DEPTH
}
@@ -419,6 +469,7 @@
pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
@@ -429,6 +480,7 @@
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
@@ -440,6 +492,7 @@
return;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
*predict_8x8_filter = x264_predict_8x8_filter_avx;
#else
@@ -449,6 +502,7 @@
pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2;
*predict_8x8_filter = x264_predict_8x8_filter_mmx2;
#if ARCH_X86
pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2;

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/predict.h

@@ -56,9 +56,12 @@
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
void x264_predict_8x16c_v_sse2( uint16_t *src );
-void x264_predict_8x16c_h_mmx2( uint8_t *src );
+void x264_predict_8x16c_h_mmx2( pixel *src );
void x264_predict_8x16c_h_sse2( pixel *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
+void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
@@ -68,7 +71,7 @@
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse2( uint16_t *src );
-void x264_predict_8x8c_h_mmx2( uint8_t *src );
+void x264_predict_8x8c_h_mmx2( pixel *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
@@ -93,8 +96,10 @@
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/quant-a.asm

@@ -138,11 +138,7 @@
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%else ; !sse4
mova m0, [%1]
ABSD m1, m0
@@ -156,11 +152,7 @@
psrld m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%endif ; cpuflag
%endmacro
@@ -180,11 +172,7 @@
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
-%if %4
- por m5, m2
-%else
- SWAP 5, 2
-%endif
+ ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_DC %1, %2, %3, %4
@@ -208,11 +196,7 @@
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%endmacro
%macro QUANT_TWO_AC 4
@@ -231,11 +215,7 @@
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
-%if %4
- por m5, m2
-%else
- SWAP 5, 2
-%endif
+ ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_AC_MMX %1, %2, %3, %4
@@ -307,11 +287,7 @@
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
-%if %4
- por m5, m0
-%else
- SWAP 5, 0
-%endif
+ ACCUM por, 5, 0, %4
%endmacro
%macro QUANT_TWO 7
@@ -327,13 +303,8 @@
PSIGNW m2, m3
mova %1, m0
mova %2, m2
-%if %7
- por m5, m0
- por m5, m2
-%else
- SWAP 5, 0
+ ACCUM por, 5, 0, %7
por m5, m2
-%endif
%endmacro
;-----------------------------------------------------------------------------
@@ -950,10 +921,10 @@
;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
- lea r10, [decimate_table4]
- lea r11, [decimate_mask_table4]
- %define table r10
- %define mask_table r11
+ lea r4, [decimate_table4]
+ lea r5, [decimate_mask_table4]
+ %define table r4
+ %define mask_table r5
%else
%define table decimate_table4
%define mask_table decimate_mask_table4
@@ -1019,10 +990,10 @@
%macro DECIMATE8x8 0
%ifdef ARCH_X86_64
-cglobal decimate_score64, 1,4
+cglobal decimate_score64, 1,5
%ifdef PIC
- lea r10, [decimate_table8]
- %define table r10
+ lea r4, [decimate_table8]
+ %define table r4
%else
%define table decimate_table8
%endif
@@ -1381,8 +1352,16 @@
movifnidn t1, r1mp
pxor m2, m2
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
- not t5d
- shl t5d, 32-((%1+1)&~1)
+%if %1==15
+ shr t5d, 1
+%elif %1==8
+ and t5d, 0xff
+%elif %1==4
+ and t5d, 0xf
+%endif
+ xor t5d, (1<<%1)-1
+ mov [t1+4], t5d
+ shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
@@ -1394,12 +1373,12 @@
LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
- mov [t1+t6 +4+16*4], t3b
- mov [t1+t6*4+ 4], t2d
+ mov [t1+t6+8+16*4], t3b
+ mov [t1+t6*4+ 8], t2d
%else
mov t2w, [t0+t4*2]
- mov [t1+t6 +4+16*2], t3b
- mov [t1+t6*2+ 4], t2w
+ mov [t1+t6+8+16*2], t3b
+ mov [t1+t6*2+ 8], t2w
%endif
inc t3d
shl t5d, t3b

Changed: x264-snapshot-20120126-2245.tar.bz2/common/x86/quant.h

@@ -110,5 +110,17 @@
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
+int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
+int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
+int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced );
+int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
+int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
+int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
+int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
+int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs );
+int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs );
+int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS );
+int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS );
#endif
Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/sad-a.asm
@@ -242,11 +242,7 @@
psadbw m1, m3
psadbw m2, m4
lea r2, [r2+2*r3]
-%if %1
- paddw m0, m1
-%else
- SWAP 0, 1
-%endif
+ ACCUM paddw, 0, 1, %1
paddw m0, m2
%endmacro
@@ -287,10 +283,11 @@
psadbw m3, m5
psadbw m4, m6
psadbw m5, m7
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- paddw m0, m5
+ ;max sum: 31*16*255(pixel_max)=126480
+ paddd m0, m2
+ paddd m0, m3
+ paddd m0, m4
+ paddd m0, m5
mova m2, m6
mova m3, m7
sub r2d, 2
@@ -321,7 +318,8 @@
jg .loop
.end:
movhlps m1, m0
- paddw m0, m1
+ ;max sum: 31*16*255(pixel_max)=126480
+ paddd m0, m1
movd eax, m0
RET
@@ -389,25 +387,13 @@
movq m5, [r0+FENC_STRIDE*%1]
movq m4, m5
psadbw m4, m0
-%if %1
- paddw m1, m4
-%else
- SWAP 1, 4
-%endif
+ ACCUM paddw, 1, 4, %1
movq m4, m5
psadbw m4, m6
-%if %1
- paddw m2, m4
-%else
- SWAP 2, 4
-%endif
+ ACCUM paddw, 2, 4, %1
pshufw m4, m7, %2
psadbw m5, m4
-%if %1
- paddw m3, m5
-%else
- SWAP 3, 5
-%endif
+ ACCUM paddw, 3, 5, %1
%endmacro
INIT_MMX
@@ -465,13 +451,8 @@
psadbw m5, m6
paddw m1, m3
paddw m4, m5
-%if %1
- paddw m0, m1
- paddw m2, m4
-%else
- SWAP 0,1
- SWAP 2,4
-%endif
+ ACCUM paddw, 0, 1, %1
+ ACCUM paddw, 2, 4, %1
%endmacro
%macro INTRA_SAD_8x8C 0
@@ -1436,7 +1417,7 @@
jmp pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
- PROLOGUE 6,7
+ PROLOGUE 6,9
%ifdef WIN64
movsxd r4, r4d
sub rsp, 8
@@ -1446,26 +1427,26 @@
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r4
- mov r10, r0
- mov r11, r5
+ mov r7, r0
+ mov r8, r5
call pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
+ mov [r8], eax
%ifdef WIN64
mov r2, [rsp]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
+ mov [r8+4], eax
%ifdef WIN64
mov r2, [rsp+8]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
+ mov [r8+8], eax
%ifdef WIN64
add rsp, 24
%endif
@@ -1502,8 +1483,8 @@
jmp pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
- PROLOGUE 6,7
- mov r11, r6mp
+ PROLOGUE 6,9
+ mov r8, r6mp
%ifdef WIN64
movsxd r5, r5d
%endif
@@ -1513,33 +1494,33 @@
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r5
- mov r10, r0
+ mov r7, r0
call pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
+ mov [r8], eax
%ifdef WIN64
mov r2, [rsp]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
+ mov [r8+4], eax
%ifdef WIN64
mov r2, [rsp+8]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
+ mov [r8+8], eax
%ifdef WIN64
mov r2, [rsp+16]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+12], eax
+ mov [r8+12], eax
%ifdef WIN64
add rsp, 24
%endif
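
Note: the paddw -> paddd switch above widens the SAD accumulator because, per the in-line comment, the worst-case sum no longer fits in 16 bits. A quick illustrative check of that bound (standalone C, not x264 code):

    #include <assert.h>

    int main(void)
    {
        int max_sum = 31 * 16 * 255;   /* row pairs * width * max pixel difference */
        assert(max_sum == 126480);     /* exceeds 65535, so 16-bit paddw could wrap */
        return 0;
    }
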
Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/sad16-a.asm
@@ -29,6 +29,8 @@
SECTION .text
cextern pw_1
+cextern pw_4
+cextern pw_8
;=============================================================================
; SAD MMX
@@ -347,6 +349,57 @@
%endrep
%endmacro
+%macro PIXEL_VSAD 0
+cglobal pixel_vsad, 3,3,8
+ mova m0, [r0]
+ mova m1, [r0+16]
+ mova m2, [r0+2*r1]
+ mova m3, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m0, m2
+ psubw m1, m3
+ ABSW2 m0, m1, m0, m1, m4, m5
+ paddw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m4, [r0]
+ mova m5, [r0+16]
+ mova m6, [r0+2*r1]
+ mova m7, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m2, m4
+ psubw m3, m5
+ psubw m4, m6
+ psubw m5, m7
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ ABSW m4, m4, m1
+ ABSW m5, m5, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m2, m6
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
+%else
+ HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
+%endif
+ movd eax, m0
+ RET
+%endmacro
+INIT_XMM sse2
+PIXEL_VSAD
+INIT_XMM ssse3
+PIXEL_VSAD
+INIT_XMM xop
+PIXEL_VSAD
+
;-----------------------------------------------------------------------------
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
; uint16_t *pix2, int i_stride, int scores[3] )
@@ -418,3 +471,129 @@
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+%macro INTRA_SAD_X3_4x4 0
+cglobal intra_sad_x3_4x4, 3,3,7
+ movq m0, [r1-1*FDEC_STRIDEB]
+ movq m1, [r0+0*FENC_STRIDEB]
+ movq m2, [r0+2*FENC_STRIDEB]
+ pshuflw m6, m0, q1032
+ paddw m6, m0
+ pshuflw m5, m6, q2301
+ paddw m6, m5
+ punpcklqdq m6, m6 ;A+B+C+D 8 times
+ punpcklqdq m0, m0
+ movhps m1, [r0+1*FENC_STRIDEB]
+ movhps m2, [r0+3*FENC_STRIDEB]
+ psubw m3, m1, m0
+ psubw m0, m2
+ ABSW m3, m3, m5
+ ABSW m0, m0, m5
+ paddw m0, m3
+ HADDW m0, m5
+ movd [r2], m0 ;V prediction cost
+ movd m3, [r1+0*FDEC_STRIDEB-4]
+ movhps m3, [r1+1*FDEC_STRIDEB-8]
+ movd m4, [r1+2*FDEC_STRIDEB-4]
+ movhps m4, [r1+3*FDEC_STRIDEB-8]
+ pshufhw m3, m3, q3333
+ pshufhw m4, m4, q3333
+ pshuflw m3, m3, q1111 ; FF FF EE EE
+ pshuflw m4, m4, q1111 ; HH HH GG GG
+ paddw m5, m3, m4
+ pshufd m0, m5, q1032
+ paddw m5, m6
+ paddw m5, m0
+ paddw m5, [pw_4]
+ psrlw m5, 3
+ psubw m6, m5, m2
+ psubw m5, m1
+ psubw m1, m3
+ psubw m2, m4
+ ABSW m5, m5, m0
+ ABSW m6, m6, m0
+ ABSW m1, m1, m0
+ ABSW m2, m2, m0
+ paddw m5, m6
+ paddw m1, m2
+ HADDW m5, m0
+ HADDW m1, m2
+ movd [r2+8], m5 ;DC prediction cost
+ movd [r2+4], m1 ;H prediction cost
+ RET
+%endmacro
+
+INIT_XMM sse2
+INTRA_SAD_X3_4x4
+INIT_XMM ssse3
+INTRA_SAD_X3_4x4
+INIT_XMM avx
+INTRA_SAD_X3_4x4
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
+;-----------------------------------------------------------------------------
+
+;m0 = DC
+;m6 = V
+;m7 = H
+;m1 = DC score
+;m2 = V score
+;m3 = H score
+;m5 = temp
+;m4 = pixel row
+
+%macro INTRA_SAD_HVDC_ITER 2
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m0
+ ABSW m4, m4, m5
+ ACCUM paddw, 1, 4, %1
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m6
+ ABSW m4, m4, m5
+ ACCUM paddw, 2, 4, %1
+ pshufd m5, m7, %2
+ psubw m5, [r0+(%1-4)*FENC_STRIDEB]
+ ABSW m5, m5, m4
+ ACCUM paddw, 3, 5, %1
+%endmacro
+
+%macro INTRA_SAD_X3_8x8 0
+cglobal intra_sad_x3_8x8, 3,3,8
+ add r0, 4*FENC_STRIDEB
+ movu m0, [r1+7*SIZEOF_PIXEL]
+ mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
+ mova m7, m0
+ paddw m0, m6
+ punpckhwd m7, m7
+ HADDW m0, m4
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ SPLATW m0, m0
+ INTRA_SAD_HVDC_ITER 0, q3333
+ INTRA_SAD_HVDC_ITER 1, q2222
+ INTRA_SAD_HVDC_ITER 2, q1111
+ INTRA_SAD_HVDC_ITER 3, q0000
+ movq m7, [r1+7*SIZEOF_PIXEL]
+ punpcklwd m7, m7
+ INTRA_SAD_HVDC_ITER 4, q3333
+ INTRA_SAD_HVDC_ITER 5, q2222
+ INTRA_SAD_HVDC_ITER 6, q1111
+ INTRA_SAD_HVDC_ITER 7, q0000
+ HADDW m2, m4
+ HADDW m3, m4
+ HADDW m1, m4
+ movd [r2+0], m2
+ movd [r2+4], m3
+ movd [r2+8], m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+INTRA_SAD_X3_8x8
+INIT_XMM ssse3
+INTRA_SAD_X3_8x8
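
Note: for reference, a scalar sketch of what the new intra_sad_x3_4x4 computes, assuming the usual H.264 vertical/horizontal/DC 4x4 predictions; the strides, function name, and signature below are simplified and illustrative, not the x264 API.

    #include <stdint.h>
    #include <stdlib.h>

    /* res[0] = V cost, res[1] = H cost, res[2] = DC cost, matching the stores above. */
    static void intra_sad_x3_4x4_ref(const uint16_t *fenc, int fenc_stride,
                                     const uint16_t *fdec, int fdec_stride, int res[3])
    {
        const uint16_t *top = fdec - fdec_stride;            /* A..D */
        int dc = 0;
        for (int i = 0; i < 4; i++)
            dc += top[i] + fdec[i*fdec_stride - 1];          /* plus left E..H */
        dc = (dc + 4) >> 3;                                  /* same rounding as pw_4 + psrlw 3 */
        int sad_v = 0, sad_h = 0, sad_dc = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int p = fenc[y*fenc_stride + x];
                sad_v  += abs(p - top[x]);
                sad_h  += abs(p - fdec[y*fdec_stride - 1]);
                sad_dc += abs(p - dc);
            }
        res[0] = sad_v; res[1] = sad_h; res[2] = sad_dc;
    }
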
Added | x264-snapshot-20120126-2245.tar.bz2/common/x86/trellis-64.asm
@@ -0,0 +1,890 @@
+;*****************************************************************************
+;* trellis-64.asm: x86_64 trellis quantization
+;*****************************************************************************
+;* Copyright (C) 2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
+;*****************************************************************************
+
+; This is a pretty straight-forward translation of the C code, except:
+; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
+; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
+; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
+; nodes are invalid).
+; * Interprocedural register allocation. Eliminates argument-passing overhead
+; to trellis_coef* subroutines. Also reduces codesize.
+
+; Optimizations that I tried, and rejected because they were not faster:
+; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
+; Costs too much icache compared to the negligible speedup.
+; * There are only 21 possible sets of live node_ctxs; we could keep track of
+; exactly which set we're in and feed that (along with abs_level) into a jump
+; table instead of the switch to select a trellis_coef subroutine. This would
+; eliminate all branches about which node_ctxs are live, but costs either a
+; bunch of icache or a bunch of call/ret, and the jump table itself is
+; unpredictable.
+; * Separate versions of trellis_coef* depending on whether we're doing the 1st
+; or the 2nd of the two abs_level candidates. This would eliminate some
+; branches about if(score is better).
+; * Special case more values of coef. I had a coef2 at some intermediate point
+; in the optimization process, but it didn't end up worthwhile in conjunction
+; with all the other optimizations.
+; * Unroll or simd writeback. I don't know why this didn't help.
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+pd_8: times 4 dd 8
+pd_m16: times 4 dd -16
+pd_0123: dd 0, 1, 2, 3
+pd_4567: dd 4, 5, 6, 7
+sq_1: dq 1, 0
+pq_128: times 2 dq 128
+pq_ffffffff: times 2 dq 0xffffffff
+
+cextern cabac_entropy
+cextern cabac_transition
+cextern cabac_size_unary
+cextern cabac_transition_unary
+cextern dct4_weight_tab
+cextern dct8_weight_tab
+cextern dct4_weight2_tab
+cextern dct8_weight2_tab
+cextern last_coeff_flag_offset_8x8
+cextern significant_coeff_flag_offset_8x8
+cextern coeff_flag_offset_chroma_422_dc
+
+SECTION .text
+
+%define TRELLIS_SCORE_BIAS 1<<60
+%define SIZEOF_NODE 16
+%define CABAC_SIZE_BITS 8
+%define LAMBDA_BITS 4
+
+%macro SQUARE 2 ; dst, tmp
+ ; could use pmuldq here, to eliminate the abs. but that would involve
+ ; templating a sse4 version of all of trellis, for negligible speedup.
+%if cpuflag(ssse3)
+ pabsd m%1, m%1
+ pmuludq m%1, m%1
+%elifdef HIGH_BIT_DEPTH
+ ABSD m%2, m%1
+ SWAP %1, %2
+ pmuludq m%1, m%1
+%else
+ pmuludq m%1, m%1
+ pand m%1, [pq_ffffffff]
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int trellis_cabac_4x4_psy(
+; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
+; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
+; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
+; uint64_t level_state0, uint16_t level_state1,
+; int b_ac, dctcoef *fenc_dct, int psy_trellis )
+;-----------------------------------------------------------------------------
+%macro TRELLIS 4
+%define num_coefs %2
+%define dc %3
+%define psy %4
+cglobal %1, 4,15,9
+ %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
+ %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
+ SUB rsp, pad
+ DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
+%ifdef WIN64
+ %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
+%else
+ %define level_statem rsp+stack_offset+32
+%endif
+ %define b_acm r11m ; 4x4 only
+ %define b_interlacedm r11m ; 8x8 only
+ %define i_coefsm1 r11m ; dc only
+ %define fenc_dctm r12m
+ %define psy_trellism r13m
+%if num_coefs == 64
+ shl dword b_interlacedm, 6
+ %define dct_weight1_tab dct8_weight_tab
+ %define dct_weight2_tab dct8_weight2_tab
+%else
+ %define dct_weight1_tab dct4_weight_tab
+ %define dct_weight2_tab dct4_weight2_tab
+%endif
+
+ %define stack rsp
+ %define last_nnzm [stack+0]
+ %define zigzagm [stack+8]
+ mov last_nnzm, iid
+ mov zigzagm, zigzagq
+%ifndef WIN64
+ %define orig_coefsm [stack+16]
+ %define quant_coefsm [stack+24]
+ mov orig_coefsm, orig_coefsq
+ mov quant_coefsm, quant_coefsq
+%endif
+ %define unquant_mfm [stack+32]
+ %define levelgt1_ctxm [stack+40]
+ %define ssd stack+48
+ %define cost_siglast stack+80
+ %define level_tree stack+96
+
+ ; trellis_node_t is layed out differently than C.
+ ; struct-of-arrays rather than array-of-structs, for simd.
+ %define nodes_curq r7
+ %define nodes_prevq r8
+ %define node_score(x) x*8
+ %define node_level_idx(x) 64+x*4
+ %define node_cabac_state(x) 96+x*4
+ lea nodes_curq, [level_tree + level_tree_size]
+ lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
+ mov r6, TRELLIS_SCORE_BIAS
+ mov [nodes_curq + node_score(0)], r6
+ mov dword [nodes_curq + node_level_idx(0)], 0
+ movd mm0, [level_statem + 0]
+ punpcklbw mm0, [level_statem + 4]
+ punpcklwd mm0, [level_statem + 8]
+ %define level_state_packed mm0 ; version for copying into node.cabac_state
+ pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
+ movq [nodes_curq + node_score(1)], m7
+ mova [nodes_curq + node_score(2)], m7
+
+ %define levels_usedq r4
+ %define levels_usedd r4d
+ mov dword [level_tree], 0
+ mov levels_usedd, 1
+
+ %define abs_levelq r9
+ %define abs_leveld r9d
+ %define abs_coefq r14
+ %define zigzagiq r5
+ %define zigzagid r5d
+
+%if num_coefs == 8
+ mov dword levelgt1_ctxm, 8
+%else
+ mov dword levelgt1_ctxm, 9
+%endif
+%if psy
+ movd m6, psy_trellism
+ %define psy_trellis m6
+%elif dc
+ movd m6, [unquant_mfq]
+ paddd m6, m6
+ punpcklqdq m6, m6
+ %define unquant_mf m6
+%endif
+%ifdef PIC
+%if dc == 0
+ mov unquant_mfm, unquant_mfq
+%endif
+ ; Keep a single offset register to PICify all global constants.
+ ; They're all relative to "beginning of this asm file's .text section",
+ ; even tables that aren't in this file.
+ ; (Any address in .text would work, this one was just convenient.)
+ lea r0, [$$]
+ %define GLOBAL +r0-$$
+%else
+ %define GLOBAL
+%endif
+
+ TRELLIS_LOOP 0 ; node_ctx 0..3
+ TRELLIS_LOOP 1 ; node_ctx 1..7
+
+.writeback:
+ ; int level = bnode->level_idx;
+ ; for( int i = b_ac; i <= last_nnz; i++ )
+ ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
+ ; level = level_tree[level].next;
+ mov iid, last_nnzm
+ add zigzagq, iiq
+ neg iiq
+%if num_coefs == 16 && dc == 0
+ mov r2d, b_acm
+ add iiq, r2
+%endif
+ %define dctq r10
+ mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
+.writeback_loop:
+ movzx r2, byte [zigzagq + iiq]
+%if cpuflag(ssse3)
+ movd m0, [level_tree + r0*4]
+ movzx r0, word [level_tree + r0*4]
+ psrld m0, 16
+ movd m1, [dctq + r2*SIZEOF_DCTCOEF]
+%ifdef HIGH_BIT_DEPTH
+ psignd m0, m1
+ movd [dctq + r2*SIZEOF_DCTCOEF], m0
+%else
+ psignw m0, m1
+ movd r4d, m0
+ mov [dctq + r2*SIZEOF_DCTCOEF], r4w
+%endif
+%else
+ mov r5d, [level_tree + r0*4]
+%ifdef HIGH_BIT_DEPTH
+ mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
+%else
+ movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
+%endif
+ movzx r0d, r5w
+ sar r4d, 31
+ shr r5d, 16
+ xor r5d, r4d
+ sub r5d, r4d
+%ifdef HIGH_BIT_DEPTH
+ mov [dctq + r2*SIZEOF_DCTCOEF], r5d
+%else
+ mov [dctq + r2*SIZEOF_DCTCOEF], r5w
+%endif
+%endif
+ inc iiq
+ jle .writeback_loop
+
+ mov eax, 1
+.return:
+ ADD rsp, pad
+ RET
+
+%if num_coefs == 16 && dc == 0
+.return_zero:
+ pxor m0, m0
+ mova [r10+ 0], m0
+ mova [r10+16], m0
+%ifdef HIGH_BIT_DEPTH
+ mova [r10+32], m0
+ mova [r10+48], m0
+%endif
+ jmp .return
+%endif
+%endmacro ; TRELLIS
+
+
+
+%macro TRELLIS_LOOP 1 ; ctx_hi
+.i_loop%1:
+ ; if( !quant_coefs[i] )
+ mov r6, quant_coefsm
+%ifdef HIGH_BIT_DEPTH
+ mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
+%else
+ movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
+%endif
+
+ ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
+ ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
+ mov r10, cabac_state_sigm
+%if num_coefs == 64
+ mov r6d, b_interlacedm
+%ifdef PIC
+ add r6d, iid
+ movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
+%else
+ movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
+%endif
+ movzx r10, byte [r10 + r6]
+%elif num_coefs == 8
+ movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
+ movzx r10, byte [r10 + r13]
+%else
+ movzx r10, byte [r10 + iiq]
+%endif
+
+ test abs_leveld, abs_leveld
+ jnz %%.nonzero_quant_coef
+
+%if %1 == 0
+ ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
+ ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+ ; nodes_cur[0].score -= cost_sig0;
+ movzx r10, word [cabac_entropy + r10*2 GLOBAL]
+ imul r10, lambda2q
+ shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
+ sub [nodes_curq + node_score(0)], r10
+%endif
+ ZERO_LEVEL_IDX %1, cur
+ jmp .i_continue%1
+
+%%.nonzero_quant_coef:
+ ; int sign_coef = orig_coefs[zigzag[i]];
+ ; int abs_coef = abs( sign_coef );
+ ; int q = abs( quant_coefs[i] );
+ movzx zigzagid, byte [zigzagq+iiq]
+ movd m0, abs_leveld
+ mov r6, orig_coefsm
+%ifdef HIGH_BIT_DEPTH
+ movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+%else
+ movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ psrad m1, 16
+%endif
+ punpcklqdq m0, m0 ; quant_coef
+ punpcklqdq m1, m1 ; sign_coef
+%if cpuflag(ssse3)
+ pabsd m0, m0
+ pabsd m2, m1 ; abs_coef
+%else
+ pxor m8, m8
+ pcmpgtd m8, m1 ; sign_mask
+ pxor m0, m8
+ pxor m2, m1, m8
+ psubd m0, m8
+ psubd m2, m8
+%endif
+ psubd m0, [sq_1] ; abs_level
+ movd abs_leveld, m0
+
+ xchg nodes_curq, nodes_prevq
+
+ ; if( i < num_coefs-1 )
+ ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
+ ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i
+ ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
+ ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
+ ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
+ ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
+%if %1 == 0
+%if dc && num_coefs != 8
+ cmp iid, i_coefsm1
+%else
+ cmp iid, num_coefs-1
+%endif
+ je %%.zero_siglast
+%endif
+ movzx r11, word [cabac_entropy + r10*2 GLOBAL]
+ xor r10, 1
+ movzx r12, word [cabac_entropy + r10*2 GLOBAL]
+ mov [cost_siglast+0], r11d
+ mov r10, cabac_state_lastm
+%if num_coefs == 64
+ movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
+ movzx r10, byte [r10 + r6]
+%elif num_coefs == 8
+ movzx r10, byte [r10 + r13]
+%else
+ movzx r10, byte [r10 + iiq]
+%endif
+ movzx r11, word [cabac_entropy + r10*2 GLOBAL]
+ add r11, r12
+ mov [cost_siglast+4], r11d
+%if %1 == 0
+ xor r10, 1
+ movzx r10, word [cabac_entropy + r10*2 GLOBAL]
+ add r10, r12
+ mov [cost_siglast+8], r10d
+%endif
+%%.skip_siglast:
+
+ ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
+ ; int d = abs_coef - unquant_abs_level;
+ ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
+%if dc
+ pmuludq m0, unquant_mf
+%else
+%ifdef PIC
+ mov r10, unquant_mfm
+ movd m3, [r10 + zigzagiq*4]
+%else
+ movd m3, [unquant_mfq + zigzagiq*4]
+%endif
+ punpcklqdq m3, m3
+ pmuludq m0, m3
+%endif
+ paddd m0, [pq_128]
+ psrld m0, 8 ; unquant_abs_level
+%if psy || dc == 0
+ mova m4, m0
+%endif
+ psubd m0, m2
+ SQUARE 0, 3
+%if dc
+ psllq m0, 8
+%else
+ movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
+ punpcklqdq m5, m5
+ pmuludq m0, m5
+%endif
+
+%if psy
+ test iid, iid
+ jz %%.dc_rounding
+ ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
+ ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
+ ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
+ ; ssd1[k] -= psy_weight * psy_value;
+ mov r6, fenc_dctm
+%ifdef HIGH_BIT_DEPTH
+ movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+%else
+ movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ psrad m3, 16 ; orig_coef
+%endif
+ punpcklqdq m3, m3
+%if cpuflag(ssse3)
+ psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
+%else
+ PSIGN d, m4, m8
+%endif
+ psubd m3, m1 ; predicted_coef
+ paddd m4, m3
+%if cpuflag(ssse3)
+ pabsd m4, m4
+%else
+ ABSD m3, m4
+ SWAP 4, 3
+%endif
+ movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
+ pmuludq m1, psy_trellis
+ punpcklqdq m1, m1
+ pmuludq m4, m1
+ psubq m0, m4
+%if %1
+%%.dc_rounding:
+%endif
+%endif
+%if %1 == 0
+ mova [ssd], m0
+%endif
+
+%if dc == 0 && %1 == 0
+ test iid, iid
+ jnz %%.skip_dc_rounding
+%%.dc_rounding:
+ ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
+ ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
+ ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
+ psrad m1, 31 ; sign_coef>>31
+ paddd m4, [pd_8]
+ paddd m4, m1
+ pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
+ psubd m4, m2 ; d
+ SQUARE 4, 3
+ pmuludq m4, m5
+ mova [ssd], m4
+%%.skip_dc_rounding:
+%endif
+ mova [ssd+16], m0
+
+ %assign stack_offset_bak stack_offset
+ cmp abs_leveld, 1
+ jl %%.switch_coef0
+%if %1 == 0
+ mov r10, [ssd] ; trellis_coef* args
+%endif
+ movq r12, m0
+ ; for( int j = 0; j < 8; j++ )
+ ; nodes_cur[j].score = TRELLIS_SCORE_MAX;
+%if cpuflag(ssse3)
+ mova [nodes_curq + node_score(0)], m7
+ mova [nodes_curq + node_score(2)], m7
+%else ; avoid store-forwarding stalls on k8/k10
+%if %1 == 0
+ movq [nodes_curq + node_score(0)], m7
+%endif
+ movq [nodes_curq + node_score(1)], m7
+ movq [nodes_curq + node_score(2)], m7
+ movq [nodes_curq + node_score(3)], m7
+%endif
+ mova [nodes_curq + node_score(4)], m7
+ mova [nodes_curq + node_score(6)], m7
+ je %%.switch_coef1
+%%.switch_coefn:
+ call trellis_coefn.entry%1
+ call trellis_coefn.entry%1b
+ jmp .i_continue1
+%%.switch_coef1:
+ call trellis_coef1.entry%1
+ call trellis_coefn.entry%1b
+ jmp .i_continue1
+%%.switch_coef0:
+ call trellis_coef0_%1
+ call trellis_coef1.entry%1b
+
+.i_continue%1:
+ dec iid
+%if num_coefs == 16 && dc == 0
+ cmp iid, b_acm
+%endif
+ jge .i_loop%1
+
+ call trellis_bnode_%1
+%if %1 == 0
+%if num_coefs == 16 && dc == 0
+ jz .return_zero
+%else
+ jz .return
+%endif
+ jmp .writeback
+
+%%.zero_siglast:
+ xor r6d, r6d
+ mov [cost_siglast+0], r6
+ mov [cost_siglast+8], r6d
+ jmp %%.skip_siglast
+%endif
+%endmacro ; TRELLIS_LOOP
+
+; just a synonym for %if
+%macro IF0 1+
+%endmacro
+%macro IF1 1+
+ %1
+%endmacro
+
+%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
+ ; for( int j = 0; j < 8; j++ )
+ ; nodes_cur[j].level_idx = levels_used;
+ ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
+ ; levels_used++;
+ add levels_usedd, 3
+ and levels_usedd, ~3 ; allow aligned stores
+ movd m0, levels_usedd
+ pshufd m0, m0, 0
+ IF%1 mova m1, m0
+ paddd m0, [pd_0123]
+ IF%1 paddd m1, [pd_4567]
+ mova m2, [nodes_%2q + node_level_idx(0)]
+ IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
+ mova [nodes_curq + node_level_idx(0)], m0
+ IF%1 mova [nodes_curq + node_level_idx(4)], m1
+ mova [level_tree + (levels_usedq+0)*4], m2
+ IF%1 mova [level_tree + (levels_usedq+4)*4], m3
+ add levels_usedd, (1+%1)*4
+%endmacro
+
+INIT_XMM sse2
+TRELLIS trellis_cabac_4x4, 16, 0, 0
+TRELLIS trellis_cabac_8x8, 64, 0, 0
+TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
+TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
+TRELLIS trellis_cabac_dc, 16, 1, 0
+TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
+INIT_XMM ssse3
+TRELLIS trellis_cabac_4x4, 16, 0, 0
+TRELLIS trellis_cabac_8x8, 64, 0, 0
+TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
+TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
+TRELLIS trellis_cabac_dc, 16, 1, 0
+TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
+
+
+
+%define stack rsp+gprsize
+%define scoreq r14
+%define bitsq r13
+%define bitsd r13d
+
+INIT_XMM
+%macro clocal 1
+ ALIGN 16
+ global mangle(x264_%1)
+ mangle(x264_%1):
+ %1:
+ %assign stack_offset stack_offset_bak+gprsize
+%endmacro
+
+%macro TRELLIS_BNODE 1 ; ctx_hi
+clocal trellis_bnode_%1
+ ; int j = ctx_hi?1:0;
+ ; trellis_node_t *bnode = &nodes_cur[j];
+ ; while( ++j < (ctx_hi?8:4) )
+ ; if( nodes_cur[j].score < bnode->score )
+ ; bnode = &nodes_cur[j];
+%assign j %1
+ mov rax, [nodes_curq + node_score(j)]
+ lea rax, [rax*8 + j]
+%rep 3+3*%1
+%assign j j+1
+ mov r11, [nodes_curq + node_score(j)]
+ lea r11, [r11*8 + j]
+ cmp rax, r11
+ cmova rax, r11
+%endrep
+ mov r10, dctm
+ and eax, 7
+ ret
+%endmacro ; TRELLIS_BNODE
+TRELLIS_BNODE 0
+TRELLIS_BNODE 1
+
+
+%macro TRELLIS_COEF0 1 ; ctx_hi
+clocal trellis_coef0_%1
+ ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+ mov r11d, [cost_siglast+0]
+ imul r11, lambda2q
+ shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
+ add r11, [ssd+16]
+%if %1 == 0
+ ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
+ mov scoreq, [nodes_prevq + node_score(0)]
+ add scoreq, [ssd]
+ sub scoreq, r11
+ mov [nodes_curq + node_score(0)], scoreq
+%endif
+ ; memcpy
+ mov scoreq, [nodes_prevq + node_score(1)]
+ mov [nodes_curq + node_score(1)], scoreq
+ mova m1, [nodes_prevq + node_score(2)]
+ mova [nodes_curq + node_score(2)], m1
+%if %1
+ mova m1, [nodes_prevq + node_score(4)]
+ mova [nodes_curq + node_score(4)], m1
+ mova m1, [nodes_prevq + node_score(6)]
+ mova [nodes_curq + node_score(6)], m1
+%endif
+ mov r6d, [nodes_prevq + node_cabac_state(3)]
+ mov [nodes_curq + node_cabac_state(3)], r6d
+%if %1
+ mova m1, [nodes_prevq + node_cabac_state(4)]
+ mova [nodes_curq + node_cabac_state(4)], m1
+%endif
+ ZERO_LEVEL_IDX %1, prev
+ ret
+%endmacro ; TRELLIS_COEF0
+TRELLIS_COEF0 0
+TRELLIS_COEF0 1
+
+
+
+%macro START_COEF 1 ; gt1
+ ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
+ mov scoreq, [nodes_prevq + node_score(j)]
+%if j > 0
+ test scoreq, scoreq
+ js .ctx %+ nextj_if_invalid
+%endif
+
+ ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
+%if j >= 3
+ movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
+ movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
+%else
+ movzx r6d, byte [level_statem + coeff_abs_level1_offs]
+%endif
+%if %1
+ xor r6d, 1
+%endif
+ movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
+
+ ; n.score += ssd;
+ ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
+%if j == 0
+ add scoreq, r10
+ add bitsd, [cost_siglast+8]
+%else
+ add scoreq, r12
+ add bitsd, [cost_siglast+4]
+%endif
+%endmacro ; START_COEF
+
+%macro END_COEF 1
+ ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+ imul bitsq, lambda2q
+ shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
+ add scoreq, bitsq
+
+ ; if( n.score < nodes_cur[node_ctx].score )
+ ; SET_LEVEL( n, abs_level );
+ ; nodes_cur[node_ctx] = n;
+ cmp scoreq, [nodes_curq + node_score(node_ctx)]
+ jae .ctx %+ nextj_if_valid
+ mov [nodes_curq + node_score(node_ctx)], scoreq
+%if j == 2 || (j <= 3 && node_ctx == 4)
+ ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
+ movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
+%elif j >= 3
+ ; if we have updated before, then copy cabac_state from the parent node
+ mov r6d, [nodes_prevq + node_cabac_state(j)]
+ mov [nodes_curq + node_cabac_state(node_ctx)], r6d
+%endif
+%if j >= 3 ; skip the transition if we're not going to reuse the context
+ mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
+%endif
+%if %1 && node_ctx == 7
+ mov r6d, levelgt1_ctxm
+ mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
+%endif
+ mov r6d, [nodes_prevq + node_level_idx(j)]
+%if %1
+ mov r11d, abs_leveld
+ shl r11d, 16
+ or r6d, r11d
+%else
+ or r6d, 1<<16
+%endif
+ mov [level_tree + levels_usedq*4], r6d
+ mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
+ inc levels_usedd
+%endmacro ; END_COEF
+
+
+
+%macro COEF1 2
+ %assign j %1
+ %assign nextj_if_valid %1+1
+ %assign nextj_if_invalid %2
+%if j < 4
+ %assign coeff_abs_level1_offs j+1
+%else
+ %assign coeff_abs_level1_offs 0
+%endif
+%if j < 3
+ %assign node_ctx j+1
+%else
+ %assign node_ctx j
+%endif
+.ctx %+ j:
+ START_COEF 0
+ add bitsd, 1 << CABAC_SIZE_BITS
+ END_COEF 0
+%endmacro ; COEF1
+
+%macro COEFN 2
+ %assign j %1
+ %assign nextj_if_valid %2
+ %assign nextj_if_invalid %2
+%if j < 4
+ %assign coeff_abs_level1_offs j+1
+ %assign coeff_abs_levelgt1_offs 5
+%else
+ %assign coeff_abs_level1_offs 0
+ %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
+%endif
+%if j < 4
+ %assign node_ctx 4
+%elif j < 7
+ %assign node_ctx j+1
+%else
+ %assign node_ctx 7
+%endif
+.ctx %+ j:
+ START_COEF 1
+ ; if( abs_level >= 15 )
+ ; bits += bs_size_ue_big(...)
+ add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
+ ; n.cabac_state[levelgt1_ctx]
+%if j == 7 ; && compiling support for 4:2:2
+ mov r6d, levelgt1_ctxm
+ %define coeff_abs_levelgt1_offs r6
+%endif
+%if j == 7
+ movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
+%else
+ movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
+%endif
+ ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
+ add r10d, r1d
+ movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
+ add bitsd, r6d
+%if node_ctx == 7
+ movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL]
+%endif
+ END_COEF 1
+%endmacro ; COEFN
+
+
+
+clocal trellis_coef1
+.entry0b: ; ctx_lo, larger of the two abs_level candidates
+ mov r10, [ssd+8]
+ sub r10, r11
+ mov r12, [ssd+24]
+ sub r12, r11
+.entry0: ; ctx_lo, smaller of the two abs_level candidates
+ COEF1 0, 4
+ COEF1 1, 4
+ COEF1 2, 4
+ COEF1 3, 4
+.ctx4:
+ rep ret
+.entry1b: ; ctx_hi, larger of the two abs_level candidates
+ mov r12, [ssd+24]
+ sub r12, r11
+.entry1: ; ctx_hi, smaller of the two abs_level candidates
+trellis_coef1_hi:
+ COEF1 1, 2
+ COEF1 2, 3
+ COEF1 3, 4
+ COEF1 4, 5
+ COEF1 5, 6
+ COEF1 6, 7
+ COEF1 7, 8
+.ctx8:
+ rep ret
+
+%macro COEFN_PREFIX 1
+ ; int prefix = X264_MIN( abs_level - 1, 14 );
+ mov r1d, abs_leveld
+ cmp abs_leveld, 15
+ jge .level_suffix%1
+ xor r5d, r5d
+.skip_level_suffix%1:
+ shl r1d, 7
+%endmacro
+
+%macro COEFN_SUFFIX 1
+.level_suffix%1:
+ ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
+ lea r5d, [abs_levelq-14]
+ bsr r5d, r5d
+ shl r5d, CABAC_SIZE_BITS+1
+ add r5d, 1<<CABAC_SIZE_BITS
+ ; int prefix = X264_MIN( abs_level - 1, 14 );
+ mov r1d, 15
+ jmp .skip_level_suffix%1
+%endmacro
+
+clocal trellis_coefn
+.entry0b:
+ mov r10, [ssd+8]
+ mov r12, [ssd+24]
+ inc abs_leveld
+.entry0:
+ ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
+ ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
+ ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
+ ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
+ ; The C version has to be fully separate since C doesn't support multiple
+ ; entrypoints. But return-on-first-failure isn't very important here (as
+ ; opposed to coef1), so I might as well reduce codesize.
+ COEFN_PREFIX 0
+ COEFN 0, 1
+ COEFN 1, 2
+ COEFN 2, 3
+ COEFN 3, 8
+.ctx8:
+ mov zigzagq, zigzagm ; unspill since r1 was clobbered
+ ret
+.entry1b:
+ mov r12, [ssd+24]
+ inc abs_leveld
+.entry1:
+ COEFN_PREFIX 1
+ COEFN 4, 5
+ COEFN 5, 6
+ COEFN 6, 7
+ COEFN 7, 1
+ jmp .ctx1
+ COEFN_SUFFIX 0
+ COEFN_SUFFIX 1
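
Note: the .writeback comments above quote the C loop this asm implements. A hedged scalar sketch of that loop, assuming the same packing the asm uses (next index in the low 16 bits of a level_tree entry, abs_level in the high 16) and a 16-bit dctcoef; all names are illustrative.

    #include <stdint.h>

    static void trellis_writeback(const uint32_t *level_tree, int level_idx,
                                  const uint8_t *zigzag, const int16_t *orig_coefs,
                                  int16_t *dct, int b_ac, int last_nnz)
    {
        for (int i = b_ac; i <= last_nnz; i++) {
            uint32_t node = level_tree[level_idx];
            int abs_level = node >> 16;                  /* asm: shr r5d, 16 */
            int z = zigzag[i];
            dct[z] = orig_coefs[z] >= 0 ? abs_level : -abs_level;  /* SIGN(level, orig) */
            level_idx = node & 0xffff;                   /* asm: movzx r0d, r5w */
        }
    }
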
Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/x86inc.asm
@@ -1,11 +1,12 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
+;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
@@ -91,6 +92,9 @@
default rel
%endif
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU intelnop
+
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
@@ -124,18 +128,20 @@
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
-%macro DECLARE_REG 6
+%macro DECLARE_REG 5-6
%define r%1q %2
%define r%1d %3
%define r%1w %4
%define r%1b %5
- %define r%1m %6
- %ifid %6 ; i.e. it's a register
+ %if %0 == 5
+ %define r%1m %3
%define r%1mp %2
%elifdef ARCH_X86_64 ; memory
- %define r%1mp qword %6
+ %define r%1m [rsp + stack_offset + %6]
+ %define r%1mp qword r %+ %1m
%else
- %define r%1mp dword %6
+ %define r%1m [esp + stack_offset + %6]
+ %define r%1mp dword r %+ %1m
%endif
%define r%1 %2
%endmacro
@@ -183,7 +189,7 @@
%endrep
%endmacro
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%ifdef ARCH_X86_64
%define gprsize 8
@@ -201,6 +207,33 @@
%assign stack_offset stack_offset-gprsize
%endmacro
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
%macro SUB 2
sub %1, %2
%ifidn %1, rsp
@@ -247,6 +280,8 @@
%endrep
%endif
+ %assign %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
%assign %%i 0
%rep %0
%xdefine %1q r %+ %%i %+ q
@@ -258,40 +293,36 @@
%assign %%i %%i+1
%rotate 1
%endrep
- %assign n_arg_names %%i
+ %assign stack_offset %%stack_offset
+ %assign n_arg_names %0
%endmacro
%ifdef WIN64 ; Windows x64 ;=================================================
-DECLARE_REG 0, rcx, ecx, cx, cl, ecx
-DECLARE_REG 1, rdx, edx, dx, dl, edx
-DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
-%define r7m [rsp + stack_offset + 64]
-%define r8m [rsp + stack_offset + 72]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp + stack_offset + 8 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rcx, ecx, cx, cl
+DECLARE_REG 1, rdx, edx, dx, dl
+DECLARE_REG 2, R8, R8D, R8W, R8B
+DECLARE_REG 3, R9, R9D, R9W, R9B
+DECLARE_REG 4, R10, R10D, R10W, R10B, 40
+DECLARE_REG 5, R11, R11D, R11W, R11B, 48
+DECLARE_REG 6, rax, eax, ax, al, 56
+DECLARE_REG 7, rdi, edi, di, dil, 64
+DECLARE_REG 8, rsi, esi, si, sil, 72
+DECLARE_REG 9, rbx, ebx, bx, bl, 80
+DECLARE_REG 10, rbp, ebp, bp, bpl, 88
+DECLARE_REG 11, R12, R12D, R12W, R12B, 96
+DECLARE_REG 12, R13, R13D, R13W, R13B, 104
+DECLARE_REG 13, R14, R14D, R14W, R14B, 112
+DECLARE_REG 14, R15, R15D, R15W, R15B, 120
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- %if regs_used > 4
- push r4
- push r5
- %assign stack_offset stack_offset+16
- %endif
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
WIN64_SPILL_XMM %3
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
@@ -302,12 +333,11 @@
%endif
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
- sub rsp, (xmm_regs_used-6)*16+16
- %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+ SUB rsp, (xmm_regs_used-6)*16+16
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
%endrep
%endif
%endmacro
@@ -317,7 +347,7 @@
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
%endrep
add %1, (xmm_regs_used-6)*16+16
%endif
@@ -331,15 +361,12 @@
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
- %if regs_used > 4
- pop r5
- pop r4
- %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
ret
%endmacro
%macro REP_RET 0
- %if regs_used > 4 || xmm_regs_used > 6
+ %if regs_used > 7 || xmm_regs_used > 6
RET
%else
rep ret
@@ -348,92 +375,80 @@
%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
-DECLARE_REG 0, rdi, edi, di, dil, edi
-DECLARE_REG 1, rsi, esi, si, sil, esi
-DECLARE_REG 2, rdx, edx, dx, dl, edx
-DECLARE_REG 3, rcx, ecx, cx, cl, ecx
-DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
-%define r7m [rsp + stack_offset + 16]
-%define r8m [rsp + stack_offset + 24]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp - 40 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rdi, edi, di, dil
+DECLARE_REG 1, rsi, esi, si, sil
+DECLARE_REG 2, rdx, edx, dx, dl
+DECLARE_REG 3, rcx, ecx, cx, cl
+DECLARE_REG 4, R8, R8D, R8W, R8B
+DECLARE_REG 5, R9, R9D, R9W, R9B
+DECLARE_REG 6, rax, eax, ax, al, 8
+DECLARE_REG 7, R10, R10D, R10W, R10B, 16
+DECLARE_REG 8, R11, R11D, R11W, R11B, 24
+DECLARE_REG 9, rbx, ebx, bx, bl, 32
+DECLARE_REG 10, rbp, ebp, bp, bpl, 40
+DECLARE_REG 11, R12, R12D, R12W, R12B, 48
+DECLARE_REG 12, R13, R13D, R13W, R13B, 56
+DECLARE_REG 13, R14, R14D, R14W, R14B, 64
+DECLARE_REG 14, R15, R15D, R15W, R15B, 72
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
- ASSERT %2 <= 7
- LOAD_IF_USED 6, %1
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
ret
%endmacro
%macro REP_RET 0
- rep ret
+ %if regs_used > 9
+ RET
+ %else
+ rep ret
+ %endif
%endmacro
%else ; X86_32 ;==============================================================
-DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
-DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
-DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
-DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
-DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
-DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
-DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
-%define r7m [esp + stack_offset + 32]
-%define r8m [esp + stack_offset + 36]
+DECLARE_REG 0, eax, eax, ax, al, 4
+DECLARE_REG 1, ecx, ecx, cx, cl, 8
+DECLARE_REG 2, edx, edx, dx, dl, 12
+DECLARE_REG 3, ebx, ebx, bx, bl, 16
+DECLARE_REG 4, esi, esi, si, null, 20
+DECLARE_REG 5, edi, edi, di, null, 24
+DECLARE_REG 6, ebp, ebp, bp, null, 28
%define rsp esp
-%macro PUSH_IF_USED 1 ; reg_id
- %if %1 < regs_used
- push r%1
- %assign stack_offset stack_offset+4
- %endif
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
%endmacro
-%macro POP_IF_USED 1 ; reg_id
- %if %1 < regs_used
- pop r%1
- %endif
-%endmacro
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [esp + stack_offset + 4 + %1*4]
- %endif
-%endmacro
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- PUSH_IF_USED 3
- PUSH_IF_USED 4
- PUSH_IF_USED 5
- PUSH_IF_USED 6
- LOAD_IF_USED 0, %1
- LOAD_IF_USED 1, %1
- LOAD_IF_USED 2, %1
- LOAD_IF_USED 3, %1
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS %4
%endmacro
%macro RET 0
- POP_IF_USED 6
- POP_IF_USED 5
- POP_IF_USED 4
- POP_IF_USED 3
+ POP_IF_USED 6, 5, 4, 3
ret
%endmacro
@@ -454,8 +469,6 @@
%endmacro
%endif
-
-
;=============================================================================
; arch-independent part
;=============================================================================
@@ -784,16 +797,38 @@
%endrep
%undef i
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
;%1 == instruction
;%2 == 1 if float, 0 if int
-;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
- %if sizeof%5==32
- v%1 %5, %6, %7
+ %ifid %5
+ %define %%sizeofreg sizeof%5
+ %elifid %6
+ %define %%sizeofreg sizeof%6
+ %else
+ %define %%sizeofreg mmsize
+ %endif
+ %if %%sizeofreg==32
+ %if %4>=3
+ v%1 %5, %6, %7
+ %else
+ v%1 %5, %6
+ %endif
%else
- %if sizeof%5==8
+ %if %%sizeofreg==8
%define %%regmov movq
%elif %2
%define %%regmov movaps
@@ -803,16 +838,17 @@
%if %4>=3+%3
%ifnidn %5, %6
- %if avx_enabled && sizeof%5==16
+ %if avx_enabled && %%sizeofreg==16
v%1 %5, %6, %7
%else
+ CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
%%regmov %5, %6
%1 %5, %7
%endif
%else
%1 %5, %7
%endif
- %elif %3
+ %elif %4>=3
%1 %5, %6, %7
%else
%1 %5, %6
@@ -820,15 +856,37 @@
%endif
%endmacro
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+ %assign %%swap 0
+ %if avx_enabled
+ %ifnid %6
+ %assign %%swap 1
+ %endif
+ %elifnidn %5, %6
+ %ifnid %7
+ %assign %%swap 1
+ %endif
+ %endif
+ %if %%swap && %3 == 0 && %8 == 1
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+ %else
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+ %endif
+%endmacro
+
;%1 == instruction
;%2 == 1 if float, 0 if int
-;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
-%macro AVX_INSTR 3
- %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+ %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
%ifidn %3, fnord
RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
%elifidn %4, fnord
- RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
+ RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
%elifidn %5, fnord
RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
%else
@@ -837,158 +895,158 @@
%endmacro
%endmacro
-AVX_INSTR addpd, 1, 0
-AVX_INSTR addps, 1, 0
-AVX_INSTR addsd, 1, 0
-AVX_INSTR addss, 1, 0
-AVX_INSTR addsubpd, 1, 0
-AVX_INSTR addsubps, 1, 0
-AVX_INSTR andpd, 1, 0
-AVX_INSTR andps, 1, 0
-AVX_INSTR andnpd, 1, 0
-AVX_INSTR andnps, 1, 0
-AVX_INSTR blendpd, 1, 0
-AVX_INSTR blendps, 1, 0
-AVX_INSTR blendvpd, 1, 0
-AVX_INSTR blendvps, 1, 0
-AVX_INSTR cmppd, 1, 0
-AVX_INSTR cmpps, 1, 0
-AVX_INSTR cmpsd, 1, 0
-AVX_INSTR cmpss, 1, 0
-AVX_INSTR divpd, 1, 0
-AVX_INSTR divps, 1, 0
-AVX_INSTR divsd, 1, 0
-AVX_INSTR divss, 1, 0
-AVX_INSTR dppd, 1, 0
-AVX_INSTR dpps, 1, 0
-AVX_INSTR haddpd, 1, 0
-AVX_INSTR haddps, 1, 0
-AVX_INSTR hsubpd, 1, 0
-AVX_INSTR hsubps, 1, 0
-AVX_INSTR maxpd, 1, 0
-AVX_INSTR maxps, 1, 0
-AVX_INSTR maxsd, 1, 0
-AVX_INSTR maxss, 1, 0
-AVX_INSTR minpd, 1, 0
-AVX_INSTR minps, 1, 0
-AVX_INSTR minsd, 1, 0
-AVX_INSTR minss, 1, 0
-AVX_INSTR movsd, 1, 0
-AVX_INSTR movss, 1, 0
-AVX_INSTR mpsadbw, 0, 1
-AVX_INSTR mulpd, 1, 0
-AVX_INSTR mulps, 1, 0
-AVX_INSTR mulsd, 1, 0
-AVX_INSTR mulss, 1, 0
-AVX_INSTR orpd, 1, 0
-AVX_INSTR orps, 1, 0
-AVX_INSTR packsswb, 0, 0
-AVX_INSTR packssdw, 0, 0
-AVX_INSTR packuswb, 0, 0
-AVX_INSTR packusdw, 0, 0
-AVX_INSTR paddb, 0, 0
-AVX_INSTR paddw, 0, 0
-AVX_INSTR paddd, 0, 0
-AVX_INSTR paddq, 0, 0
-AVX_INSTR paddsb, 0, 0
-AVX_INSTR paddsw, 0, 0
-AVX_INSTR paddusb, 0, 0
-AVX_INSTR paddusw, 0, 0
-AVX_INSTR palignr, 0, 1
-AVX_INSTR pand, 0, 0
-AVX_INSTR pandn, 0, 0
-AVX_INSTR pavgb, 0, 0
-AVX_INSTR pavgw, 0, 0
-AVX_INSTR pblendvb, 0, 0
-AVX_INSTR pblendw, 0, 1
-AVX_INSTR pcmpestri, 0, 0
-AVX_INSTR pcmpestrm, 0, 0
-AVX_INSTR pcmpistri, 0, 0
-AVX_INSTR pcmpistrm, 0, 0
-AVX_INSTR pcmpeqb, 0, 0
-AVX_INSTR pcmpeqw, 0, 0
-AVX_INSTR pcmpeqd, 0, 0
-AVX_INSTR pcmpeqq, 0, 0
-AVX_INSTR pcmpgtb, 0, 0
-AVX_INSTR pcmpgtw, 0, 0
-AVX_INSTR pcmpgtd, 0, 0
-AVX_INSTR pcmpgtq, 0, 0
-AVX_INSTR phaddw, 0, 0
-AVX_INSTR phaddd, 0, 0
-AVX_INSTR phaddsw, 0, 0
-AVX_INSTR phsubw, 0, 0
-AVX_INSTR phsubd, 0, 0
-AVX_INSTR phsubsw, 0, 0
-AVX_INSTR pmaddwd, 0, 0
-AVX_INSTR pmaddubsw, 0, 0
-AVX_INSTR pmaxsb, 0, 0
-AVX_INSTR pmaxsw, 0, 0
-AVX_INSTR pmaxsd, 0, 0
-AVX_INSTR pmaxub, 0, 0
-AVX_INSTR pmaxuw, 0, 0
-AVX_INSTR pmaxud, 0, 0
-AVX_INSTR pminsb, 0, 0
-AVX_INSTR pminsw, 0, 0
-AVX_INSTR pminsd, 0, 0
-AVX_INSTR pminub, 0, 0
-AVX_INSTR pminuw, 0, 0
-AVX_INSTR pminud, 0, 0
-AVX_INSTR pmulhuw, 0, 0
-AVX_INSTR pmulhrsw, 0, 0
-AVX_INSTR pmulhw, 0, 0
-AVX_INSTR pmullw, 0, 0
-AVX_INSTR pmulld, 0, 0
-AVX_INSTR pmuludq, 0, 0
-AVX_INSTR pmuldq, 0, 0
-AVX_INSTR por, 0, 0
-AVX_INSTR psadbw, 0, 0
-AVX_INSTR pshufb, 0, 0
-AVX_INSTR psignb, 0, 0
-AVX_INSTR psignw, 0, 0
-AVX_INSTR psignd, 0, 0
-AVX_INSTR psllw, 0, 0
-AVX_INSTR pslld, 0, 0
-AVX_INSTR psllq, 0, 0
-AVX_INSTR pslldq, 0, 0
-AVX_INSTR psraw, 0, 0
-AVX_INSTR psrad, 0, 0
-AVX_INSTR psrlw, 0, 0
-AVX_INSTR psrld, 0, 0
-AVX_INSTR psrlq, 0, 0
-AVX_INSTR psrldq, 0, 0
-AVX_INSTR psubb, 0, 0
-AVX_INSTR psubw, 0, 0
-AVX_INSTR psubd, 0, 0
-AVX_INSTR psubq, 0, 0
-AVX_INSTR psubsb, 0, 0
-AVX_INSTR psubsw, 0, 0
-AVX_INSTR psubusb, 0, 0
-AVX_INSTR psubusw, 0, 0
-AVX_INSTR punpckhbw, 0, 0
-AVX_INSTR punpckhwd, 0, 0
-AVX_INSTR punpckhdq, 0, 0
-AVX_INSTR punpckhqdq, 0, 0
-AVX_INSTR punpcklbw, 0, 0
-AVX_INSTR punpcklwd, 0, 0
-AVX_INSTR punpckldq, 0, 0
-AVX_INSTR punpcklqdq, 0, 0
-AVX_INSTR pxor, 0, 0
-AVX_INSTR shufps, 0, 1
-AVX_INSTR subpd, 1, 0
-AVX_INSTR subps, 1, 0
-AVX_INSTR subsd, 1, 0
-AVX_INSTR subss, 1, 0
-AVX_INSTR unpckhpd, 1, 0
-AVX_INSTR unpckhps, 1, 0
-AVX_INSTR unpcklpd, 1, 0
-AVX_INSTR unpcklps, 1, 0
-AVX_INSTR xorpd, 1, 0
-AVX_INSTR xorps, 1, 0
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 1, 0
-AVX_INSTR pfsub, 1, 0
-AVX_INSTR pfmul, 1, 0
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
; base-4 constants for shuffles
%assign i 0
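
Note: the RUN_AVX_INSTR1 comment above documents when source operands are swapped. A rough C model of that decision, kept only as an illustration of the rule (the authoritative logic is the macro itself):

    /* Returns nonzero when src1 and src2 should be exchanged before emitting.
     * AVX encodings want a memory operand in src2; the SSE fallback (mov + op)
     * prefers it in src1. Swapping is only legal for commutative, non-4-operand ops. */
    static int should_swap(int avx_enabled, int dst_equals_src1,
                           int src1_is_mem, int src2_is_mem,
                           int is_4op, int is_commutative)
    {
        int wrong_slot = avx_enabled ? src1_is_mem
                                     : (!dst_equals_src1 && src2_is_mem);
        return wrong_slot && !is_4op && is_commutative;
    }
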
Changed | x264-snapshot-20120126-2245.tar.bz2/common/x86/x86util.asm
@@ -143,6 +143,17 @@
%endif
%endmacro
+%macro WIDEN_SXWD 2
+ punpckhwd m%2, m%1
+ psrad m%2, 16
+%if cpuflag(sse4)
+ pmovsxwd m%1, m%1
+%else
+ punpcklwd m%1, m%1
+ psrad m%1, 16
+%endif
+%endmacro
+
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
pabsw %1, %2
@@ -272,7 +283,7 @@
paddd %1, %2
%endmacro
-%macro HADDW 2
+%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && mmsize == 16
vphaddwq %1, %1
movhlps %2, %1
@@ -586,7 +597,10 @@
%endmacro
%macro SUMSUB2_AB 4
-%ifnum %3
+%if cpuflag(xop)
+ pmacs%1%1 m%4, m%3, [p%1_m2], m%2
+ pmacs%1%1 m%2, m%2, [p%1_2], m%3
+%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
@@ -600,22 +614,6 @@
%endif
%endmacro
-%macro SUMSUB2_BA 4
-%if avx_enabled
- padd%1 m%4, m%2, m%3
- padd%1 m%4, m%3
- psub%1 m%3, m%2
- psub%1 m%3, m%2
- SWAP %2, %4
-%else
- mova m%4, m%2
- padd%1 m%2, m%3
- padd%1 m%2, m%3
- psub%1 m%3, m%4
- psub%1 m%3, m%4
-%endif
-%endmacro
-
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
@@ -697,7 +695,7 @@
%endmacro
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
-%if cpuflag(ssse3)
+%if BIT_DEPTH == 8 && cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
@@ -715,10 +713,10 @@
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
%else
- LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
- LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
- LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
- LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
+ LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
+ LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
+ LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
+ LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
@@ -767,13 +765,24 @@
packuswb %2, %1
%endmacro
-%macro STORE_DIFF 4
+; (high depth) in: %1, %2, min to clip, max to clip, mem128
+; in: %1, tmp, %3, mem64
+%macro STORE_DIFF 4-5
+%ifdef HIGH_BIT_DEPTH
+ psrad %1, 6
+ psrad %2, 6
+ packssdw %1, %2
+ paddw %1, %5
+ CLIPW %1, %3, %4
+ mova %5, %1
+%else
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
+%endif
%endmacro
%macro SHUFFLE_MASK_W 8
@@ -783,3 +792,12 @@
%rotate 1
%endrep
%endmacro
+
+; instruction, accum, input, iteration (zero to swap, nonzero to add)
+%macro ACCUM 4
+%if %4
+ %1 m%2, m%3
+%else
+ SWAP %2, %3
+%endif
+%endmacro
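
Note: a per-pixel scalar sketch of the new HIGH_BIT_DEPTH branch of STORE_DIFF, assuming the clipping bounds are passed in as in the macro; the helper below is illustrative, not x264 code.

    #include <stdint.h>

    /* diff is the 32-bit inverse-transform residual; pred is the pixel already in
     * fdec. Mirrors the macro: psrad 6 (arithmetic shift assumed), add, clip. */
    static inline uint16_t store_diff_px(int32_t diff, uint16_t pred,
                                         int pix_min, int pix_max)
    {
        int v = (diff >> 6) + pred;
        if (v < pix_min) v = pix_min;
        if (v > pix_max) v = pix_max;
        return (uint16_t)v;
    }
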
Changed | x264-snapshot-20120126-2245.tar.bz2/configure
@@ -212,6 +212,10 @@
rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest*
+SRCPATH="$(cd $(dirname $0); pwd)"
+[ "$SRCPATH" = "$(pwd)" ] && SRCPATH=.
+[ -n "$(echo $SRCPATH | grep ' ')" ] && die "Out of tree builds are impossible with whitespace in source path."
+
prefix='/usr/local'
exec_prefix='${prefix}'
bindir='${exec_prefix}/bin'
@@ -241,7 +245,7 @@
chroma_format="all"
compiler="GNU"
-CFLAGS="$CFLAGS -Wall -I."
+CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
LDFLAGS="$LDFLAGS"
LDFLAGSCLI="$LDFLAGSCLI"
ASFLAGS="$ASFLAGS"
@@ -380,10 +384,10 @@
STRIP="${STRIP-${cross_prefix}strip}"
if [ "x$host" = x ]; then
- host=`./config.guess`
+ host=`${SRCPATH}/config.guess`
fi
# normalize a triplet into a quadruplet
-host=`./config.sub $host`
+host=`${SRCPATH}/config.sub $host`
# split $host
host_cpu="${host%%-*}"
@@ -588,7 +592,7 @@
s390|s390x)
ARCH="S390"
;;
- parisc|parisc64)
+ hppa*|parisc*)
ARCH="PARISC"
;;
ia64)
@@ -626,12 +630,12 @@
fi
fi
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" \) ] ; then
+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then
pic="yes"
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vfmaddps xmm0, xmm0, xmm0, xmm0" ; then
+ if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
echo "Minimum version is yasm-1.0.0"
@@ -764,10 +768,10 @@
[ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil"
if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then
- if cc_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "av_get_pix_fmt_name(0);" ; then
+ if cpp_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "defined(PIX_FMT_RGB)" ; then
swscale="yes"
else
- echo "Warning: av_get_pix_fmt_name is missing from libavutil, update for swscale support"
+ echo "Warning: PIX_FMT_RGB is missing from libavutil, update for swscale support"
fi
fi
fi
@@ -795,7 +799,7 @@
fi
if [ "$ffms" = "auto" ] ; then
- ffms_major="2"; ffms_minor="14"; ffms_micro="0"; ffms_bump="0"
+ ffms_major="2"; ffms_minor="16"; ffms_micro="2"; ffms_bump="0"
ffms="no"
if ${cross_prefix}pkg-config --exists ffms2 2>/dev/null; then
@@ -997,6 +1001,7 @@
# generate config files
cat > config.mak << EOF
+SRCPATH=$SRCPATH
prefix=$prefix
exec_prefix=$exec_prefix
bindir=$bindir
@@ -1036,7 +1041,7 @@
fi
if [ "$shared" = "yes" ]; then
- API=$(grep '#define X264_BUILD' < x264.h | cut -f 3 -d ' ')
+ API=$(grep '#define X264_BUILD' < ${SRCPATH}/x264.h | cut -f 3 -d ' ')
if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then
echo "SONAME=libx264-$API.dll" >> config.mak
if [ $compiler = ICL ]; then
@@ -1087,7 +1092,7 @@
echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak
echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak
-./version.sh >> x264_config.h
+${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h
pclibs="-L$libdir -lx264 $libpthread"
@@ -1139,6 +1144,9 @@
cat conftest.log
rm conftest.log
+[ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile
+mkdir -p common/{arm,ppc,sparc,x86} encoder extras filters/video input output tools
+
echo
echo "You can run 'make' or 'make fprofiled' now."
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/analyse.c
@@ -433,8 +433,10 @@
a->i_satd_i4x4 =
a->i_satd_chroma = COST_MAX;
- /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
- a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
+ /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
+ * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
+ uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
+ a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
a->b_fast_intra = 0;
a->b_avoid_topright = 0;
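The reworked PCM decision computes the lambda-scaled cost in 64 bits and only uses it while it stays below COST_MAX, since with a very large i_lambda2 the shifted product no longer fits the int field it is stored in. A minimal sketch of the clamp, assuming x264's COST_MAX of (1<<28) and a placeholder value for X264_PCM_COST:

    #include <stdint.h>

    #define COST_MAX (1<<28)          /* assumed, as in x264's analyse code */
    #define X264_PCM_COST (1<<20)     /* placeholder value, just for the sketch */

    static int pcm_cost_capped( int lambda2 )
    {
        uint64_t cost = ((uint64_t)X264_PCM_COST * lambda2 + 128) >> 8;
        return cost < COST_MAX ? (int)cost : COST_MAX;   /* never overflow the int field */
    }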
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/cabac.c
@@ -167,7 +167,12 @@
if( i_dqp != 0 )
{
- int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1);
+ /* Faster than (i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp-1)).
+ * If you so much as sneeze on these lines, gcc will compile this suboptimally. */
+ i_dqp *= 2;
+ int val = 1 - i_dqp;
+ if( val < 0 ) val = i_dqp;
+ val--;
/* dqp is interpreted modulo (QP_MAX_SPEC+1) */
if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 )
val = 2*QP_MAX_SPEC+1 - val;
@@ -289,8 +294,8 @@
x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
if( i_abs < 9 )
{
- cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
- cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
+ cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
+ cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
}
else
{
@@ -653,7 +658,12 @@
{
227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
};
-static const uint8_t significant_coeff_flag_offset_8x8[2][63] =
+#if RDO_SKIP_BS
+extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63];
+extern const uint8_t x264_last_coeff_flag_offset_8x8[63];
+extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7];
+#else
+const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] =
{{
0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
@@ -665,14 +675,15 @@
9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14
}};
-static const uint8_t last_coeff_flag_offset_8x8[63] =
+const uint8_t x264_last_coeff_flag_offset_8x8[63] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
-static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
+const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
+#endif
// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
// 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
@@ -732,15 +743,15 @@
if( chroma422dc )
{
int count_m1 = 7;
- WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] )
+ WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] )
}
else
{
int count_m1 = count_cat_m1[ctx_block_cat];
if( count_m1 == 63 )
{
- const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
- WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] )
+ const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
+ WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] )
}
else
WRITE_SIGMAP( i, i )
@@ -794,7 +805,7 @@
* is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */
static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
{
- const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
+ const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
@@ -807,9 +818,9 @@
if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) )
{
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
- chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
- x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] :
- chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
+ chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] :
+ chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
}
if( coeff_abs > 1 )
@@ -818,13 +829,13 @@
ctx = levelgt1_ctx[0] + ctx_level;
if( coeff_abs < 15 )
{
- cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
+ cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
+ cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
- cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
+ cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
+ cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][0];
@@ -842,9 +853,9 @@
{
coeff_abs = abs(l[i]);
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
- chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 );
- x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] :
- chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
+ chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] :
+ chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
if( coeff_abs > 1 )
@@ -853,13 +864,13 @@
ctx = levelgt1_ctx[node_ctx] + ctx_level;
if( coeff_abs < 15 )
{
- cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
+ cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
+ cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
- cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
+ cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
+ cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][node_ctx];
@@ -873,7 +884,7 @@
}
else
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
- chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
+ chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
}
}
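The reshuffled delta-QP mapping in the first cabac.c hunk is just a compiler-friendlier spelling of the original ternary: dqp <= 0 still maps to -2*dqp and dqp > 0 to 2*dqp - 1. A tiny standalone check of that equivalence, illustrative only:

    #include <assert.h>

    static int dqp_code( int dqp )          /* the rewritten form from the patch */
    {
        dqp *= 2;
        int val = 1 - dqp;
        if( val < 0 ) val = dqp;
        return val - 1;
    }

    int main( void )
    {
        for( int dqp = -52; dqp <= 52; dqp++ )
            assert( dqp_code( dqp ) == (dqp <= 0 ? -2*dqp : 2*dqp - 1) );
        return 0;
    }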
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/cavlc.c
@@ -132,6 +132,7 @@
runlevel.level[1] = 2;
runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+ x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
@@ -188,12 +189,8 @@
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
- for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
- {
- int i_zl = X264_MIN( i_total_zero, 7 );
- bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
- i_total_zero -= runlevel.run[i];
- }
+ int zero_run_code = x264_run_before[runlevel.mask];
+ bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
return i_total;
}
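The new zero-run coding replaces the per-coefficient VLC loop with a single lookup: x264_run_before[] is indexed by the run mask produced by coeff_level_run (hence the added prefetch), and each entry packs the pre-concatenated run_before bits together with their total length, length in the low 5 bits, which is the order bs_write() consumes them in. A hedged sketch of that packing convention, with an invented helper name:

    #include <stdint.h>

    /* pack pre-concatenated VLC bits and their bit count into one word:
     * count in the low 5 bits, bits above it (count must be <= 31) */
    static inline uint32_t pack_vlc( uint32_t bits, int count )
    {
        return (bits << 5) | count;
    }

    /* consumer side, mirroring the patched cavlc.c:
     *   bs_write( s, packed & 0x1f, packed >> 5 );    (stream, count, bits) */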
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/encoder.c
@@ -61,7 +61,11 @@
static double x264_ssim( double ssim )
{
- return -10.0 * log10( 1 - ssim );
+ double inv_ssim = 1 - ssim;
+ if( inv_ssim <= 0.0000000001 ) /* Max 100dB */
+ return 100;
+
+ return -10.0 * log10( inv_ssim );
}
static void x264_frame_dump( x264_t *h )
@@ -472,7 +476,6 @@
if( h->param.i_threads == X264_THREADS_AUTO )
h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
- h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
if( h->param.i_threads > 1 )
{
#if !HAVE_THREAD
@@ -487,7 +490,8 @@
h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
}
}
- else
+ h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+ if( h->param.i_threads == 1 )
h->param.b_sliced_threads = 0;
h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
if( h->i_thread_frames > 1 )
@@ -1169,10 +1173,6 @@
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
- if( h->param.b_cabac )
- x264_cabac_init( h );
- else
- x264_cavlc_init();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
@@ -1181,7 +1181,10 @@
x264_quant_init( h, h->param.cpu, &h->quantf );
x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
x264_bitstream_init( h->param.cpu, &h->bsf );
- x264_dct_init_weights();
+ if( h->param.b_cabac )
+ x264_cabac_init( h );
+ else
+ x264_cavlc_init( h );
mbcmp_init( h );
chroma_dsp_init( h );
@@ -3108,6 +3111,8 @@
if( pic_out->i_pts < pic_out->i_dts )
x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" );
+ pic_out->opaque = h->fenc->opaque;
+
pic_out->img.i_csp = h->fdec->i_csp;
#if HIGH_BIT_DEPTH
pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH;
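The added pic_out->opaque = h->fenc->opaque makes the caller's opaque pointer follow the frame through reordering: with B-frames and lookahead, the picture returned by x264_encoder_encode() is generally not the one submitted in the same call, so per-frame user data has to travel with the frame itself. A hedged usage sketch (the context struct and helper are invented):

    #include <stdlib.h>
    #include <stdint.h>
    #include <x264.h>

    typedef struct { int64_t capture_time; } my_frame_ctx;   /* hypothetical per-frame data */

    /* Submit one picture and return the user context of whatever frame came out,
     * which with B-frames is usually an earlier input, or NULL while output is delayed. */
    static my_frame_ctx *encode_one( x264_t *h, x264_picture_t *pic_in, int64_t capture_time )
    {
        my_frame_ctx *ctx = malloc( sizeof(*ctx) );
        ctx->capture_time = capture_time;
        pic_in->opaque = ctx;                 /* attached to this input frame */

        x264_picture_t pic_out;
        x264_nal_t *nal; int i_nal;
        if( x264_encoder_encode( h, &nal, &i_nal, pic_in, &pic_out ) > 0 )
            return pic_out.opaque;            /* context of the frame actually encoded */
        return NULL;
    }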
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/macroblock.c
@@ -1090,7 +1090,7 @@
{
int dct8x8 = cat&1;
int size = dct8x8 ? 64 : 16;
- const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
+ const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
{
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/ratecontrol.c
@@ -2385,7 +2385,7 @@
}
}
-void x264_threads_normalize_predictors( x264_t *h )
+static void x264_threads_normalize_predictors( x264_t *h )
{
double totalsize = 0;
for( int i = 0; i < h->param.i_threads; i++ )
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/rdo.c
@@ -32,8 +32,8 @@
/* Transition and size tables for abs<9 MVD and residual coding */
/* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
-static uint8_t cabac_transition_unary[15][128];
-static uint16_t cabac_size_unary[15][128];
+uint8_t x264_cabac_transition_unary[15][128];
+uint16_t x264_cabac_size_unary[15][128];
/* Transition and size tables for abs>9 MVD */
/* Consist of 5 1s and a bypass sign bit */
static uint8_t cabac_transition_5ones[128];
@@ -365,9 +365,9 @@
* Trellis RD quantization
****************************************************************************/
-#define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
+#define TRELLIS_SCORE_MAX -1LL // negative marks the node as invalid
+#define TRELLIS_SCORE_BIAS 1LL<<60; // bias so that all valid scores are positive, even after negative contributions from psy
#define CABAC_SIZE_BITS 8
-#define SSD_WEIGHT_BITS 5
#define LAMBDA_BITS 4
/* precalculate the cost of coding various combinations of bits in a single context */
@@ -386,8 +386,8 @@
f8_bits += x264_cabac_size_decision2( &ctx, 0 );
f8_bits += 1 << CABAC_SIZE_BITS; //sign
- cabac_size_unary[i_prefix][i_ctx] = f8_bits;
- cabac_transition_unary[i_prefix][i_ctx] = ctx;
+ x264_cabac_size_unary[i_prefix][i_ctx] = f8_bits;
+ x264_cabac_transition_unary[i_prefix][i_ctx] = ctx;
}
}
for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
@@ -406,11 +406,17 @@
typedef struct
{
- int64_t score;
+ uint64_t score;
int level_idx; // index into level_tree[]
- uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
+ uint8_t cabac_state[4]; // just contexts 0,4,8,9 of the 10 relevant to coding abs_level_m1
} trellis_node_t;
+typedef struct
+{
+ uint16_t next;
+ uint16_t abs_level;
+} trellis_level_t;
+
// TODO:
// save cabac state between blocks?
// use trellis' RD score instead of x264_mb_decimate_score?
@@ -431,68 +437,307 @@
// comparable to the input. so unquant is the direct inverse of quant,
// and uses the dct scaling factors, not the idct ones.
+#define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
+
+#define SET_LEVEL(ndst, nsrc, l) {\
+ if( sizeof(trellis_level_t) == sizeof(uint32_t) )\
+ M32( &level_tree[levels_used] ) = pack16to32( nsrc.level_idx, l );\
+ else\
+ level_tree[levels_used] = (trellis_level_t){ nsrc.level_idx, l };\
+ ndst.level_idx = levels_used;\
+ levels_used++;\
+}
+
+// encode all values of the dc coef in a block which is known to have no ac
+static NOINLINE
+int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef_weight, int lambda2, uint8_t *cabac_state, int cost_sig )
+{
+ uint64_t bscore = TRELLIS_SCORE_MAX;
+ int ret = 0;
+ int q = abs( quant_coef );
+ for( int abs_level = q-1; abs_level <= q; abs_level++ )
+ {
+ int unquant_abs_level = (unquant_mf * abs_level + 128) >> 8;
+
+ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
+ int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);
+ uint64_t score = (uint64_t)d*d * coef_weight;
+
+ /* code the proposed level, and count how much entropy it would take */
+ if( abs_level )
+ {
+ unsigned f8_bits = cost_sig;
+ int prefix = X264_MIN( abs_level - 1, 14 );
+ f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 );
+ f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]];
+ if( abs_level >= 15 )
+ f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
+ score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+ }
+
+ COPY2_IF_LT( bscore, score, ret, abs_level );
+ }
+ return SIGN(ret, sign_coef);
+}
+
+// encode one value of one coef in one context
+static ALWAYS_INLINE
+int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost,
+ int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state )
+{
+ uint64_t score = nodes_prev[j].score + ssd;
+ /* code the proposed level, and count how much entropy it would take */
+ unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
+ uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx];
+ f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)];
+ uint8_t levelgt1_state;
+ if( const_level > 1 )
+ {
+ levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
+ f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
+ }
+ else
+ f8_bits += 1 << CABAC_SIZE_BITS;
+ score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+
+ /* save the node if it's better than any existing node with the same cabac ctx */
+ if( score < nodes_cur[node_ctx].score )
+ {
+ nodes_cur[node_ctx].score = score;
+ if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state
+ M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12);
+ else if( j >= 3 )
+ M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state);
+ if( j >= 3 ) // skip the transition if we're not going to reuse the context
+ nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
+ if( const_level > 1 && node_ctx == 7 )
+ nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
+ nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
+ SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
+ }
+ return levels_used;
+}
+
+// encode one value of one coef in all contexts, templated by which value that is.
+// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure.
+// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning,
+// so exploiting that would be more complicated.
+static NOINLINE
+int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ nodes_cur[0].score = nodes_prev[0].score + ssd0;
+ nodes_cur[0].level_idx = nodes_prev[0].level_idx;
+ for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ for( int j = 1; j < 8; j++ )
+ // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0.
+ if( (int64_t)nodes_prev[j].score >= 0 )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
+#define COEF(const_level, ctx_hi, j, ...)\
+ if( !j || (int64_t)nodes_prev[j].score >= 0 )\
+ levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\
+ j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\
+ level_tree, levels_used, lambda2, level_state );\
+ else if( !ctx_hi )\
+ return levels_used;
+
+static NOINLINE
+int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state )
+{
+ int abs_level = 1, prefix = 1, suffix_cost = 0;
+ COEF( 1, 0, 0, 1, 1, 0 );
+ COEF( 1, 0, 1, 2, 2, 0 );
+ COEF( 1, 0, 2, 3, 3, 0 );
+ COEF( 1, 0, 3, 3, 4, 0 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state )
+{
+ int abs_level = 1, prefix = 1, suffix_cost = 0;
+ COEF( 1, 1, 1, 2, 2, 0 );
+ COEF( 1, 1, 2, 3, 3, 0 );
+ COEF( 1, 1, 3, 3, 4, 0 );
+ COEF( 1, 1, 4, 4, 0, 0 );
+ COEF( 1, 1, 5, 5, 0, 0 );
+ COEF( 1, 1, 6, 6, 0, 0 );
+ COEF( 1, 1, 7, 7, 0, 0 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state, int levelgt1_ctx )
+{
+ int prefix = X264_MIN( abs_level-1, 14 );
+ int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+ COEF( 2, 0, 0, 4, 1, 5 );
+ COEF( 2, 0, 1, 4, 2, 5 );
+ COEF( 2, 0, 2, 4, 3, 5 );
+ COEF( 2, 0, 3, 4, 4, 5 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state, int levelgt1_ctx )
+{
+ int prefix = X264_MIN( abs_level-1, 14 );
+ int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+ COEF( 2, 1, 1, 4, 2, 5 );
+ COEF( 2, 1, 2, 4, 3, 5 );
+ COEF( 2, 1, 3, 4, 4, 5 );
+ COEF( 2, 1, 4, 5, 0, 6 );
+ COEF( 2, 1, 5, 6, 0, 7 );
+ COEF( 2, 1, 6, 7, 0, 8 );
+ COEF( 2, 1, 7, 7, 0, levelgt1_ctx );
+ return levels_used;
+}
+
static ALWAYS_INLINE
int quant_trellis_cabac( x264_t *h, dctcoef *dct,
- const udctcoef *quant_mf, const int *unquant_mf,
- const uint16_t *coef_weight, const uint8_t *zigzag,
- int ctx_block_cat, int i_lambda2, int b_ac,
- int b_chroma, int dc, int i_coefs, int idx )
+ udctcoef *quant_mf, udctcoef *quant_bias, const int *unquant_mf,
+ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
+ int b_chroma, int dc, int num_coefs, int idx )
{
- udctcoef abs_coefs[64];
- int8_t signs[64];
- trellis_node_t nodes[2][8];
- trellis_node_t *nodes_cur = nodes[0];
- trellis_node_t *nodes_prev = nodes[1];
- trellis_node_t *bnode;
+ ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] );
+ ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] );
+ const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
+ const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
- const uint8_t *levelgt1_ctx = b_chroma && dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
- const int f = 1 << 15; // no deadzone
- int i_last_nnz;
- int i;
+ int levelgt1_ctx = b_chroma && dc ? 8 : 9;
- // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
- // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
- // but it takes more time to remove dead states than you gain in reduced memory.
- struct
+ if( dc )
{
- uint16_t abs_level;
- uint16_t next;
- } level_tree[64*8*2];
- int i_levels_used = 1;
-
- /* init coefs */
- for( i = i_coefs-1; i >= b_ac; i-- )
- if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
- break;
-
- if( i < b_ac )
+ if( num_coefs == 16 )
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
+ if( !h->quantf.quant_4x4_dc( dct, quant_mf[0] >> 1, quant_bias[0] << 1 ) )
+ return 0;
+ h->zigzagf.scan_4x4( quant_coefs, dct );
+ }
+ else
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*num_coefs );
+ int nz = h->quantf.quant_2x2_dc( &dct[0], quant_mf[0] >> 1, quant_bias[0] << 1 );
+ if( num_coefs == 8 )
+ nz |= h->quantf.quant_2x2_dc( &dct[4], quant_mf[0] >> 1, quant_bias[0] << 1 );
+ if( !nz )
+ return 0;
+ for( int i = 0; i < num_coefs; i++ )
+ quant_coefs[i] = dct[zigzag[i]];
+ }
+ }
+ else
{
- /* We only need to zero an empty 4x4 block. 8x8 can be
- implicitly emptied via zero nnz, as can dc. */
- if( i_coefs == 16 && !dc )
- memset( dct, 0, 16 * sizeof(dctcoef) );
- return 0;
+ if( num_coefs == 64 )
+ {
+ h->mc.memcpy_aligned( orig_coefs, dct, sizeof(dctcoef)*64 );
+ if( !h->quantf.quant_8x8( dct, quant_mf, quant_bias ) )
+ return 0;
+ h->zigzagf.scan_8x8( quant_coefs, dct );
+ }
+ else //if( num_coefs == 16 )
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
+ if( !h->quantf.quant_4x4( dct, quant_mf, quant_bias ) )
+ return 0;
+ h->zigzagf.scan_4x4( quant_coefs, dct );
+ }
}
- i_last_nnz = i;
- idx &= i_coefs == 64 ? 3 : 15;
+ int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac;
+ uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ];
- for( ; i >= b_ac; i-- )
- {
- int coef = dct[zigzag[i]];
- abs_coefs[i] = abs(coef);
- signs[i] = coef>>31 | 1;
- }
+ /* shortcut for dc-only blocks.
+ * this doesn't affect the output, but saves some unnecessary computation. */
+ if( last_nnz == 0 && !dc )
+ {
+ int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 )
+ + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 );
+ dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig );
+ return !!dct[0];
+ }
+
+#if HAVE_MMX && ARCH_X86_64
+#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
+ cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
+ if( num_coefs == 16 && !dc )
+ if( b_chroma || !h->mb.i_psy_trellis )
+ return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac );
+ else
+ return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis );
+ else if( num_coefs == 64 && !dc )
+ if( b_chroma || !h->mb.i_psy_trellis )
+ return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced );
+ else
+ return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis);
+ else if( num_coefs == 8 && dc )
+ return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS );
+ else if( dc )
+ return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 );
+#endif
+ // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
+ // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
+ // but it takes more time to remove dead states than you gain in reduced memory.
+ trellis_level_t level_tree[64*8*2];
+ int levels_used = 1;
/* init trellis */
- for( int j = 1; j < 8; j++ )
+ trellis_node_t nodes[2][8];
+ trellis_node_t *nodes_cur = nodes[0];
+ trellis_node_t *nodes_prev = nodes[1];
+ trellis_node_t *bnode;
+ for( int j = 1; j < 4; j++ )
nodes_cur[j].score = TRELLIS_SCORE_MAX;
- nodes_cur[0].score = 0;
+ nodes_cur[0].score = TRELLIS_SCORE_BIAS;
nodes_cur[0].level_idx = 0;
level_tree[0].abs_level = 0;
level_tree[0].next = 0;
+ ALIGNED_4( uint8_t level_state[16] );
+ memcpy( level_state, cabac_state, 10 );
+ level_state[12] = cabac_state[0]; // packed subset for copying into trellis_node_t
+ level_state[13] = cabac_state[4];
+ level_state[14] = cabac_state[8];
+ level_state[15] = cabac_state[9];
+
+ idx &= num_coefs == 64 ? 3 : 15;
// coefs are processed in reverse order, because that's how the abs value is coded.
// last_coef and significant_coef flags are normally coded in forward order, but
@@ -501,160 +746,135 @@
// position, so the order doesn't matter, and we don't even have to update their contexts.
// in 8x8 blocks, some positions share contexts, so we'll just have to hope that
// cabac isn't too sensitive.
-
- memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ], 10 );
-
- for( i = i_last_nnz; i >= b_ac; i-- )
- {
- int i_coef = abs_coefs[i];
- int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
- int cost_sig[2], cost_last[2];
- trellis_node_t n;
-
- // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
- if( q == 0 )
- {
- // no need to calculate ssd of 0s: it's the same in all nodes.
- // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
- int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
- b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
- const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
- * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
- for( int j = 1; j < 8; j++ )
- {
- if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
- {
-#define SET_LEVEL(n,l) \
- level_tree[i_levels_used].abs_level = l; \
- level_tree[i_levels_used].next = n.level_idx; \
- n.level_idx = i_levels_used; \
- i_levels_used++;
-
- SET_LEVEL( nodes_cur[j], 0 );
- nodes_cur[j].score += cost_sig0;
- }
- }
- continue;
- }
-
- XCHG( trellis_node_t*, nodes_cur, nodes_prev );
-
- for( int j = 0; j < 8; j++ )
- nodes_cur[j].score = TRELLIS_SCORE_MAX;
-
- if( i < i_coefs-1 )
- {
- int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
- b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
- int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
- b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
- cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
- cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
- cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 );
- cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 );
- }
- else
- {
- cost_sig[0] = cost_sig[1] = 0;
- cost_last[0] = cost_last[1] = 0;
- }
-
- // there are a few cases where increasing the coeff magnitude helps,
- // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
- // could also try q-2: that sometimes helps, but also sometimes decimates blocks
- // that are better left coded, especially at QP > 40.
- for( int abs_level = q; abs_level >= q-1; abs_level-- )
- {
- int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
- int d = i_coef - unquant_abs_level;
- int64_t ssd;
- /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
- if( h->mb.i_psy_trellis && i && !dc && !b_chroma )
- {
- int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
- int predicted_coef = orig_coef - i_coef * signs[i];
- int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
- int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
- ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
- }
- else
- /* FIXME: for i16x16 dc is this weight optimal? */
- ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
-
- for( int j = 0; j < 8; j++ )
- {
- int node_ctx = j;
- if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
- continue;
- n = nodes_prev[j];
-
- /* code the proposed level, and count how much entropy it would take */
- if( abs_level || node_ctx )
- {
- unsigned f8_bits = cost_sig[ abs_level != 0 ];
- if( abs_level )
- {
- const int i_prefix = X264_MIN( abs_level - 1, 14 );
- f8_bits += cost_last[ node_ctx == 0 ];
- f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
- if( i_prefix > 0 )
- {
- uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]];
- f8_bits += cabac_size_unary[i_prefix][*ctx];
- *ctx = cabac_transition_unary[i_prefix][*ctx];
- if( abs_level >= 15 )
- f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
- node_ctx = coeff_abs_level_transition[1][node_ctx];
- }
- else
- {
- f8_bits += 1 << CABAC_SIZE_BITS;
- node_ctx = coeff_abs_level_transition[0][node_ctx];
- }
- }
- n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
- }
-
- if( j || i || dc )
- n.score += ssd;
- /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
- else
- {
- d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
- n.score += (int64_t)d*d * coef_weight[i];
- }
-
- /* save the node if it's better than any existing node with the same cabac ctx */
- if( n.score < nodes_cur[node_ctx].score )
- {
- SET_LEVEL( n, abs_level );
- nodes_cur[node_ctx] = n;
- }
- }
- }
- }
-
- /* output levels from the best path through the trellis */
- bnode = &nodes_cur[0];
- for( int j = 1; j < 8; j++ )
- if( nodes_cur[j].score < bnode->score )
+ int i = last_nnz;
+#define TRELLIS_LOOP(ctx_hi)\
+ for( ; i >= b_ac; i-- )\
+ {\
+ /* skip 0s: this doesn't affect the output, but saves some unnecessary computation. */\
+ if( !quant_coefs[i] )\
+ {\
+ /* no need to calculate ssd of 0s: it's the same in all nodes.\
+ * no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
+ * subtracting from one score is equivalent to adding to the rest. */\
+ if( !ctx_hi )\
+ {\
+ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\
+ * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
+ nodes_cur[0].score -= cost_sig0;\
+ }\
+ for( int j = 1; j < (ctx_hi?8:4); j++ )\
+ SET_LEVEL( nodes_cur[j], nodes_cur[j], 0 );\
+ continue;\
+ }\
+\
+ int sign_coef = orig_coefs[zigzag[i]];\
+ int abs_coef = abs( sign_coef );\
+ int q = abs( quant_coefs[i] );\
+ int cost_siglast[3]; /* { zero, nonzero, nonzero-and-last } */\
+ XCHG( trellis_node_t*, nodes_cur, nodes_prev );\
+ for( int j = ctx_hi; j < 8; j++ )\
+ nodes_cur[j].score = TRELLIS_SCORE_MAX;\
+\
+ if( i < num_coefs-1 || ctx_hi )\
+ {\
+ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\
+ int cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\
+ cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\
+ if( !ctx_hi )\
+ cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;\
+ }\
+ else\
+ {\
+ cost_siglast[0] = cost_siglast[1] = cost_siglast[2] = 0;\
+ }\
+\
+ /* there are a few cases where increasing the coeff magnitude helps,\
+ * but it's only around .003 dB, and skipping them ~doubles the speed of trellis.\
+ * could also try q-2: that sometimes helps, but also sometimes decimates blocks\
+ * that are better left coded, especially at QP > 40. */\
+ uint64_t ssd0[2], ssd1[2];\
+ for( int k = 0; k < 2; k++ )\
+ {\
+ int abs_level = q-1+k;\
+ int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);\
+ int d = abs_coef - unquant_abs_level;\
+ /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */\
+ if( h->mb.i_psy_trellis && i && !dc && !b_chroma )\
+ {\
+ int orig_coef = (num_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];\
+ int predicted_coef = orig_coef - sign_coef;\
+ int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\
+ int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\
+ ssd1[k] = (uint64_t)d*d * coef_weight2[zigzag[i]] - psy_weight * psy_value;\
+ }\
+ else\
+ /* FIXME: for i16x16 dc is this weight optimal? */\
+ ssd1[k] = (uint64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\
+ ssd0[k] = ssd1[k];\
+ if( !i && !dc && !ctx_hi )\
+ {\
+ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\
+ d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\
+ ssd0[k] = (uint64_t)d*d * coef_weight2[zigzag[i]];\
+ }\
+ }\
+\
+ /* argument passing imposes some significant overhead here. gcc's interprocedural register allocation isn't up to it. */\
+ switch( q )\
+ {\
+ case 1:\
+ ssd1[0] += (uint64_t)cost_siglast[0] * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
+ levels_used = trellis_coef0_##ctx_hi( ssd0[0]-ssd1[0], nodes_cur, nodes_prev, level_tree, levels_used );\
+ levels_used = trellis_coef1_##ctx_hi( ssd0[1]-ssd1[0], ssd1[1]-ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
+ goto next##ctx_hi;\
+ case 2:\
+ levels_used = trellis_coef1_##ctx_hi( ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
+ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ goto next1;\
+ default:\
+ levels_used = trellis_coefn_##ctx_hi( q-1, ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ goto next1;\
+ }\
+ next##ctx_hi:;\
+ }\
+ /* output levels from the best path through the trellis */\
+ bnode = &nodes_cur[ctx_hi];\
+ for( int j = ctx_hi+1; j < (ctx_hi?8:4); j++ )\
+ if( nodes_cur[j].score < bnode->score )\
bnode = &nodes_cur[j];
+ // keep 2 versions of the main quantization loop, depending on which subsets of the node_ctxs are live
+ // node_ctx 0..3, i.e. having not yet encountered any coefs that might be quantized to >1
+ TRELLIS_LOOP(0);
+
if( bnode == &nodes_cur[0] )
{
- if( i_coefs == 16 && !dc )
+ /* We only need to zero an empty 4x4 block. 8x8 can be
+ implicitly emptied via zero nnz, as can dc. */
+ if( num_coefs == 16 && !dc )
memset( dct, 0, 16 * sizeof(dctcoef) );
return 0;
}
+ if(0) // accessible only by goto, not fallthrough
+ {
+ // node_ctx 1..7 (ctx0 ruled out because we never try both level0 and level2+ on the same coef)
+ TRELLIS_LOOP(1);
+ }
+
int level = bnode->level_idx;
- for( i = b_ac; level; i++ )
+ for( i = b_ac; i <= last_nnz; i++ )
{
- dct[zigzag[i]] = level_tree[level].abs_level * signs[i];
+ dct[zigzag[i]] = SIGN(level_tree[level].abs_level, dct[zigzag[i]]);
level = level_tree[level].next;
}
- for( ; i < i_coefs; i++ )
- dct[zigzag[i]] = 0;
return 1;
}
@@ -685,24 +905,25 @@
static ALWAYS_INLINE
int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
const udctcoef *quant_mf, const int *unquant_mf,
- const uint16_t *coef_weight, const uint8_t *zigzag,
- int ctx_block_cat, int i_lambda2, int b_ac,
- int b_chroma, int dc, int i_coefs, int idx, int b_8x8 )
+ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
+ int b_chroma, int dc, int num_coefs, int idx, int b_8x8 )
{
ALIGNED_16( dctcoef quant_coefs[2][16] );
ALIGNED_16( dctcoef coefs[16] ) = {0};
+ const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
+ const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
int delta_distortion[16];
int64_t score = 1ULL<<62;
int i, j;
const int f = 1<<15;
- int nC = b_chroma && dc ? 3 + (i_coefs>>2)
+ int nC = b_chroma && dc ? 3 + (num_coefs>>2)
: ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )];
/* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different
* step/start/end than internal processing. */
int step = 1;
int start = b_ac;
- int end = i_coefs - 1;
+ int end = num_coefs - 1;
if( b_8x8 )
{
start = idx&3;
@@ -711,7 +932,7 @@
}
idx &= 15;
- i_lambda2 <<= LAMBDA_BITS;
+ lambda2 <<= LAMBDA_BITS;
/* Find last non-zero coefficient. */
for( i = end; i >= start; i -= step )
@@ -726,10 +947,10 @@
*
* We only search two roundings (nearest and nearest-1) like in CABAC trellis,
* so we just store the difference in distortion between them. */
- int i_last_nnz = b_8x8 ? i >> 2 : i;
+ int last_nnz = b_8x8 ? i >> 2 : i;
int coef_mask = 0;
int round_mask = 0;
- for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+ for( i = b_ac, j = start; i <= last_nnz; i++, j += step )
{
int coef = dct[zigzag[j]];
int abs_coef = abs(coef);
@@ -748,14 +969,14 @@
int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
int d1 = abs_coef - unquant1;
int d0 = abs_coef - unquant0;
- delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]);
+ delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]);
/* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
if( h->mb.i_psy_trellis && j && !dc && !b_chroma )
{
int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
int predicted_coef = orig_coef - coef;
- int psy_weight = b_8x8 ? x264_dct8_weight_tab[zigzag[j]] : x264_dct4_weight_tab[zigzag[j]];
+ int psy_weight = coef_weight1[zigzag[j]];
int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign);
int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign);
delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight;
@@ -778,7 +999,7 @@
bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
else
x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
- score = (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+ score = (int64_t)h->out.bs.i_bits_encoded * lambda2;
/* QNS loop: pick the change that improves RD the most, apply it, repeat.
* coef_mask and round_mask are used to simplify tracking of nonzeroness
@@ -790,7 +1011,7 @@
int iter_coef = -1;
int iter_mask = coef_mask;
int iter_round = round_mask;
- for( i = b_ac; i <= i_last_nnz; i++ )
+ for( i = b_ac; i <= last_nnz; i++ )
{
if( !delta_distortion[i] )
continue;
@@ -811,7 +1032,7 @@
bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
else
x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
- cur_score += (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+ cur_score += (int64_t)h->out.bs.i_bits_encoded * lambda2;
coefs[i] = old_coef;
if( cur_score < iter_score )
@@ -839,10 +1060,8 @@
if( coef_mask )
{
- for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+ for( i = b_ac, j = start; i < num_coefs; i++, j += step )
dct[zigzag[j]] = coefs[i];
- for( ; j <= end; j += step )
- dct[zigzag[j]] = 0;
return 1;
}
@@ -862,11 +1081,12 @@
{
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
- h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+ h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp],
+ h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx );
return quant_trellis_cavlc( h, dct,
- h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+ h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 );
}
@@ -892,11 +1112,12 @@
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
- h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+ h->quant4_mf[quant_cat][i_qp], h->quant4_bias0[quant_cat][i_qp],
+ h->unquant4_mf[quant_cat][i_qp], zigzag,
DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx );
return quant_trellis_cavlc( h, dct,
- h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+ h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], zigzag,
DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 );
}
@@ -907,14 +1128,12 @@
int b_ac = ctx_ac[ctx_block_cat];
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
- h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- x264_dct4_weight2_zigzag[MB_INTERLACED],
- x264_zigzag_scan4[MB_INTERLACED],
+ h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp],
+ h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx );
return quant_trellis_cavlc( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- x264_dct4_weight2_zigzag[MB_INTERLACED],
x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 );
}
@@ -925,9 +1144,8 @@
if( h->param.b_cabac )
{
return quant_trellis_cabac( h, dct,
- h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
- x264_dct8_weight2_zigzag[MB_INTERLACED],
- x264_zigzag_scan8[MB_INTERLACED],
+ h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias0[i_quant_cat][i_qp],
+ h->unquant8_mf[i_quant_cat][i_qp], x264_zigzag_scan8[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx );
}
@@ -937,7 +1155,6 @@
{
int nz = quant_trellis_cavlc( h, dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
- x264_dct8_weight2_zigzag[MB_INTERLACED],
x264_zigzag_scan8[MB_INTERLACED],
DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 );
/* Set up nonzero count for future calls */
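One detail worth spelling out from the trellis rewrite above is the SIGN(x,y) macro it leans on: with 32-bit ints and an arithmetic right shift (which this code already assumes elsewhere), (x ^ (y>>31)) - (y>>31) yields x when y >= 0 and -x when y < 0, i.e. it transfers y's sign onto a non-negative x without a branch. A small standalone check, illustrative only:

    #include <assert.h>

    #define SIGN(x,y) (((x)^((y)>>31))-((y)>>31))   /* parenthesized copy, for the sketch */

    int main( void )
    {
        for( int x = 0; x < 1000; x++ )
            for( int y = -1000; y <= 1000; y += 13 )
                assert( SIGN( x, y ) == (y < 0 ? -x : x) );
        return 0;
    }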
Changed | x264-snapshot-20120126-2245.tar.bz2/encoder/slicetype.c
@@ -283,7 +283,7 @@
return cost;
}
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
Changed | x264-snapshot-20120126-2245.tar.bz2/filters/video/resize.c
@@ -32,9 +32,10 @@
static int full_check( video_info_t *info, x264_param_t *param )
{
int required = 0;
- required |= info->csp != param->i_csp;
- required |= info->width != param->i_width;
- required |= info->height != param->i_height;
+ required |= info->csp != param->i_csp;
+ required |= info->width != param->i_width;
+ required |= info->height != param->i_height;
+ required |= info->fullrange != param->vui.b_fullrange;
return required;
}
@@ -44,11 +45,16 @@
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
+#ifndef PIX_FMT_BGRA64
+#define PIX_FMT_BGRA64 PIX_FMT_NONE
+#endif
+
typedef struct
{
int width;
int height;
int pix_fmt;
+ int range;
} frame_prop_t;
typedef struct
@@ -59,6 +65,7 @@
cli_pic_t buffer;
int buffer_allocated;
int dst_csp;
+ int input_range;
struct SwsContext *ctx;
uint32_t ctx_flags;
/* state of swapping chroma planes pre and post resize */
@@ -142,62 +149,63 @@
case X264_CSP_YV24: /* specially handled via swapping chroma */
case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P;
case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_RGB48 : PIX_FMT_RGB24;
- /* the next 3 csps have no equivalent 16bit depth in swscale */
+ case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_BGR48 : PIX_FMT_BGR24;
+ case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_BGRA64 : PIX_FMT_BGRA;
+ /* the next csp has no equivalent 16bit depth in swscale */
case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_NV12;
- case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_BGR24;
- case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_NONE : PIX_FMT_BGRA;
default: return PIX_FMT_NONE;
}
}
+static int pix_number_of_planes( const AVPixFmtDescriptor *pix_desc )
+{
+ int num_planes = 0;
+ for( int i = 0; i < pix_desc->nb_components; i++ )
+ {
+ int plane_plus1 = pix_desc->comp[i].plane + 1;
+ num_planes = X264_MAX( plane_plus1, num_planes );
+ }
+ return num_planes;
+}
+
static int pick_closest_supported_csp( int csp )
{
int pix_fmt = convert_csp_to_pix_fmt( csp );
- switch( pix_fmt )
+ // first determine the base csp
+ int ret = X264_CSP_NONE;
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_descriptors+pix_fmt;
+ if( (unsigned)pix_fmt >= PIX_FMT_NB || !pix_desc->name )
+ return ret;
+
+ const char *pix_fmt_name = pix_desc->name;
+ int is_rgb = pix_desc->flags & (PIX_FMT_RGB | PIX_FMT_PAL);
+ int is_bgr = !!strstr( pix_fmt_name, "bgr" );
+ if( is_bgr || is_rgb )
+ {
+ if( pix_desc->nb_components == 4 ) // has alpha
+ ret = X264_CSP_BGRA;
+ else if( is_bgr )
+ ret = X264_CSP_BGR;
+ else
+ ret = X264_CSP_RGB;
+ }
+ else
{
- case PIX_FMT_YUV420P16LE:
- case PIX_FMT_YUV420P16BE:
- return X264_CSP_I420 | X264_CSP_HIGH_DEPTH;
- case PIX_FMT_YUV422P:
- case PIX_FMT_YUYV422:
- case PIX_FMT_UYVY422:
- case PIX_FMT_YUVJ422P:
- return X264_CSP_I422;
- case PIX_FMT_YUV422P16LE:
- case PIX_FMT_YUV422P16BE:
- return X264_CSP_I422 | X264_CSP_HIGH_DEPTH;
- case PIX_FMT_YUV444P:
- case PIX_FMT_YUVJ444P:
- return X264_CSP_I444;
- case PIX_FMT_YUV444P16LE:
- case PIX_FMT_YUV444P16BE:
- return X264_CSP_I444 | X264_CSP_HIGH_DEPTH;
- case PIX_FMT_RGB24:
- case PIX_FMT_RGB565BE:
- case PIX_FMT_RGB565LE:
- case PIX_FMT_RGB555BE:
- case PIX_FMT_RGB555LE:
- return X264_CSP_RGB;
- case PIX_FMT_RGB48BE:
- case PIX_FMT_RGB48LE:
- return X264_CSP_RGB | X264_CSP_HIGH_DEPTH;
- case PIX_FMT_BGR24:
- case PIX_FMT_BGR565BE:
- case PIX_FMT_BGR565LE:
- case PIX_FMT_BGR555BE:
- case PIX_FMT_BGR555LE:
- return X264_CSP_BGR;
- case PIX_FMT_ARGB:
- case PIX_FMT_RGBA:
- case PIX_FMT_ABGR:
- case PIX_FMT_BGRA:
- return X264_CSP_BGRA;
- case PIX_FMT_NV12:
- case PIX_FMT_NV21:
- return X264_CSP_NV12;
- default:
- return X264_CSP_I420;
+ // yuv-based
+ if( pix_desc->nb_components == 1 || pix_desc->nb_components == 2 ) // no chroma
+ ret = X264_CSP_I420;
+ else if( pix_desc->log2_chroma_w && pix_desc->log2_chroma_h ) // reduced chroma width & height
+ ret = (pix_desc->nb_components == pix_number_of_planes( pix_desc )) ? X264_CSP_I420 : X264_CSP_NV12;
+ else if( pix_desc->log2_chroma_w ) // reduced chroma width only
+ ret = (pix_desc->nb_components == pix_number_of_planes( pix_desc )) ? X264_CSP_I422 : X264_CSP_NV16;
+ else
+ ret = X264_CSP_I444;
}
+ // now determine high depth
+ for( int i = 0; i < pix_desc->nb_components; i++ )
+ if( pix_desc->comp[i].depth_minus1 >= 8 )
+ ret |= X264_CSP_HIGH_DEPTH;
+ return ret;
}
static int handle_opts( const char **optlist, char **opts, video_info_t *info, resizer_hnd_t *h )
@@ -343,57 +351,29 @@
return 0;
}
-static int handle_jpeg( int *format )
-{
- switch( *format )
- {
- case PIX_FMT_YUVJ420P:
- *format = PIX_FMT_YUV420P;
- return 1;
- case PIX_FMT_YUVJ422P:
- *format = PIX_FMT_YUV422P;
- return 1;
- case PIX_FMT_YUVJ444P:
- *format = PIX_FMT_YUV444P;
- return 1;
- case PIX_FMT_YUVJ440P:
- *format = PIX_FMT_YUV440P;
- return 1;
- default:
- return 0;
- }
-}
-
static int x264_init_sws_context( resizer_hnd_t *h )
{
+ if( h->ctx )
+ sws_freeContext( h->ctx );
+ h->ctx = sws_alloc_context();
if( !h->ctx )
- {
- h->ctx = sws_alloc_context();
- if( !h->ctx )
- return -1;
+ return -1;
- /* set flags that will not change */
- int dst_format = h->dst.pix_fmt;
- int dst_range = handle_jpeg( &dst_format );
- av_set_int( h->ctx, "sws_flags", h->ctx_flags );
- av_set_int( h->ctx, "dstw", h->dst.width );
- av_set_int( h->ctx, "dsth", h->dst.height );
- av_set_int( h->ctx, "dst_format", dst_format );
- av_set_int( h->ctx, "dst_range", dst_range ); /* FIXME: use the correct full range value */
- }
-
- int src_format = h->scale.pix_fmt;
- int src_range = handle_jpeg( &src_format );
- av_set_int( h->ctx, "srcw", h->scale.width );
- av_set_int( h->ctx, "srch", h->scale.height );
- av_set_int( h->ctx, "src_format", src_format );
- av_set_int( h->ctx, "src_range", src_range ); /* FIXME: use the correct full range value */
+ av_opt_set_int( h->ctx, "sws_flags", h->ctx_flags, 0 );
+ av_opt_set_int( h->ctx, "dstw", h->dst.width, 0 );
+ av_opt_set_int( h->ctx, "dsth", h->dst.height, 0 );
+ av_opt_set_int( h->ctx, "dst_format", h->dst.pix_fmt, 0 );
+ av_opt_set_int( h->ctx, "dst_range", h->dst.range, 0 );
+
+ av_opt_set_int( h->ctx, "srcw", h->scale.width, 0 );
+ av_opt_set_int( h->ctx, "srch", h->scale.height, 0 );
+ av_opt_set_int( h->ctx, "src_format", h->scale.pix_fmt, 0 );
+ av_opt_set_int( h->ctx, "src_range", h->scale.range, 0 );
- /* FIXME: use the correct full range values
- * FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */
+ /* FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */
sws_setColorspaceDetails( h->ctx,
- sws_getCoefficients( SWS_CS_DEFAULT ), src_range,
- sws_getCoefficients( SWS_CS_DEFAULT ), av_get_int( h->ctx, "dst_range", NULL ),
+ sws_getCoefficients( SWS_CS_DEFAULT ), h->scale.range,
+ sws_getCoefficients( SWS_CS_DEFAULT ), h->dst.range,
0, 1<<16, 1<<16 );
return sws_init_context( h->ctx, NULL, NULL ) < 0;
@@ -401,7 +381,7 @@
static int check_resizer( resizer_hnd_t *h, cli_pic_t *in )
{
- frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ) };
+ frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ), h->input_range };
if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) )
return 0;
/* also warn if the resizer was initialized after the first frame */
@@ -440,16 +420,14 @@
h->dst_csp = info->csp;
h->dst.width = info->width;
h->dst.height = info->height;
+ h->dst.range = info->fullrange; // maintain input range
if( !strcmp( opt_string, "normcsp" ) )
{
/* only in normalization scenarios is the input capable of changing properties */
h->variable_input = 1;
h->dst_csp = pick_closest_supported_csp( info->csp );
- /* now fix the catch-all i420 choice if it does not allow for the current input resolution dimensions. */
- if( h->dst_csp == X264_CSP_I420 && info->width&1 )
- h->dst_csp = X264_CSP_I444;
- if( h->dst_csp == X264_CSP_I420 && info->height&1 )
- h->dst_csp = X264_CSP_I422;
+ FAIL_IF_ERROR( h->dst_csp == X264_CSP_NONE,
+ "filter get invalid input pixel format %d (colorspace %d)\n", convert_csp_to_pix_fmt( info->csp ), info->csp )
}
else if( handle_opts( optlist, opts, info, h ) )
return -1;
@@ -459,6 +437,7 @@
h->dst_csp = param->i_csp;
h->dst.width = param->i_width;
h->dst.height = param->i_height;
+ h->dst.range = param->vui.b_fullrange; // change to libx264's range
}
h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) );
x264_free_string_array( opts );
@@ -467,6 +446,7 @@
h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND;
h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
h->scale = h->dst;
+ h->input_range = info->fullrange;
/* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */
int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER);
@@ -500,6 +480,9 @@
if( h->dst.pix_fmt != src_pix_fmt )
x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n",
av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) );
+ else if( h->dst.range != h->input_range )
+ x264_cli_log( NAME, X264_LOG_WARNING, "converting range from %s to %s\n",
+ h->input_range ? "PC" : "TV", h->dst.range ? "PC" : "TV" );
h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip
/* if the input is not variable, initialize the context */
@@ -511,9 +494,10 @@
}
/* finished initing, overwrite values */
- info->csp = h->dst_csp;
- info->width = h->dst.width;
- info->height = h->dst.height;
+ info->csp = h->dst_csp;
+ info->width = h->dst.width;
+ info->height = h->dst.height;
+ info->fullrange = h->dst.range;
h->prev_filter = *filter;
h->prev_hnd = *handle;
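Adding range to frame_prop_t is what lets check_resizer() react to a TV/PC range change at all: the cached properties are compared against the incoming frame with memcmp, and any mismatch forces the swscale context to be rebuilt. A minimal sketch of that compare-and-reinit pattern, with invented names:

    #include <string.h>

    typedef struct { int width, height, pix_fmt, range; } frame_prop_t;

    /* return 1 (and update the cache) if the incoming properties differ,
     * meaning the swscale context has to be reconfigured */
    static int props_changed( frame_prop_t *cached, frame_prop_t incoming )
    {
        if( !memcmp( &incoming, cached, sizeof(incoming) ) )
            return 0;
        *cached = incoming;
        return 1;
    }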
Changed | x264-snapshot-20120126-2245.tar.bz2/input/avs.c
@@ -235,14 +235,40 @@
"input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height )
FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1),
"input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
- const char *arg_name[2] = { NULL, "interlaced" };
- AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
char conv_func[14] = { "ConvertTo" };
strcat( conv_func, csp );
- AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, 2 ), arg_name );
+ char matrix[7] = "";
+ int arg_count = 2;
+ /* if doing a rgb <-> yuv conversion then range is handled via 'matrix'. though it's only supported in 2.56+ */
+ if( avs_version >= 2.56f && ((opt->output_csp == X264_CSP_RGB && avs_is_yuv( vi )) || (opt->output_csp != X264_CSP_RGB && avs_is_rgb( vi ))) )
+ {
+ // if converting from yuv, then we specify the matrix for the input, otherwise use the output's.
+ int use_pc_matrix = avs_is_yuv( vi ) ? opt->input_range == RANGE_PC : opt->output_range == RANGE_PC;
+ strcpy( matrix, use_pc_matrix ? "PC." : "Rec" );
+ strcat( matrix, "601" ); /* FIXME: use correct coefficients */
+ arg_count++;
+ // notification that the input range has changed to the desired one
+ opt->input_range = opt->output_range;
+ }
+ const char *arg_name[] = { NULL, "interlaced", "matrix" };
+ AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) };
+ AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name );
FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp )
res = update_clip( h, &vi, res2, res );
}
+ /* if swscale is not available, change the range if necessary. This only applies to YUV-based CSPs however */
+ if( avs_is_yuv( vi ) && opt->output_range != RANGE_AUTO && ((opt->input_range == RANGE_PC) != opt->output_range) )
+ {
+ const char *levels = opt->output_range ? "TV->PC" : "PC->TV";
+ x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels );
+ AVS_Value arg_arr[] = { res, avs_new_value_string( levels ) };
+ const char *arg_name[] = { NULL, "levels" };
+ AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name );
+ FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) )
+ res = update_clip( h, &vi, res2, res );
+ // notification that the input range has changed to the desired one
+ opt->input_range = opt->output_range;
+ }
#endif
h->func.avs_release_value( res );
Changed |
x264-snapshot-20120126-2245.tar.bz2/input/ffms.c
@@ -65,6 +65,18 @@
return 0;
}
+/* handle the deprecated jpeg pixel formats */
+static int handle_jpeg( int csp, int *fullrange )
+{
+ switch( csp )
+ {
+ case PIX_FMT_YUVJ420P: *fullrange = 1; return PIX_FMT_YUV420P;
+ case PIX_FMT_YUVJ422P: *fullrange = 1; return PIX_FMT_YUV422P;
+ case PIX_FMT_YUVJ444P: *fullrange = 1; return PIX_FMT_YUV444P;
+ default: return csp;
+ }
+}
+
static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
{
ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
@@ -119,11 +131,13 @@
const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
FAIL_IF_ERROR( !frame, "could not read frame 0\n" )
+ info->fullrange = 0;
info->width = frame->EncodedWidth;
info->height = frame->EncodedHeight;
- info->csp = frame->EncodedPixelFormat | X264_CSP_OTHER;
+ info->csp = handle_jpeg( frame->EncodedPixelFormat, &info->fullrange ) | X264_CSP_OTHER;
info->interlaced = frame->InterlacedFrame;
info->tff = frame->TopFieldFirst;
+ info->fullrange |= frame->ColorRange == FFMS_CR_JPEG;
/* ffms timestamps are in milliseconds. ffms also uses int64_ts for timebase,
* so we need to reduce large timebases to prevent overflow */
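
A quick illustration of how the two range signals in the ffms.c hunk combine; this is a fragment reusing handle_jpeg() from the hunk above and adds nothing beyond what the diff already implies:

    /* A motion-JPEG style stream that decodes to the deprecated
     * PIX_FMT_YUVJ420P pixel format. */
    int fullrange = 0;
    int csp = handle_jpeg( PIX_FMT_YUVJ420P, &fullrange ) | X264_CSP_OTHER;
    /* csp is now (PIX_FMT_YUV420P | X264_CSP_OTHER) and fullrange is 1.
     * A stream whose pixel format is the plain PIX_FMT_YUV420P but whose
     * container is tagged FFMS_CR_JPEG ends up with fullrange == 1 as well,
     * via the "info->fullrange |= ..." line above. */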
Changed |
x264-snapshot-20120126-2245.tar.bz2/input/input.h
@@ -42,6 +42,8 @@
int seek;
int progress;
int output_csp; /* convert to this csp, if applicable */
+ int output_range; /* user desired output range */
+ int input_range; /* user override input range */
} cli_input_opt_t;
/* properties of the source given by the demuxer */
@@ -50,6 +52,8 @@
int csp; /* colorspace of the input */
uint32_t fps_num;
uint32_t fps_den;
+ int fullrange; /* has 2^bit_depth-1 instead of 219*2^(bit_depth-8) ranges (YUV only) */
+ int width;
int height;
int interlaced;
int num_frames;
@@ -60,7 +64,6 @@
uint32_t timebase_num;
uint32_t timebase_den;
int vfr;
- int width;
} video_info_t;
/* image data type used by x264cli */
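
The new fullrange comment is terse; here is a worked, self-contained version of the arithmetic it refers to (assuming 8- and 10-bit depths, the ones x264 builds actually use):

    #include <stdio.h>

    /* Full range uses all 2^bit_depth code values; limited ("TV") range
     * spans 219*2^(bit_depth-8) luma steps starting at 16*2^(bit_depth-8). */
    static void print_ranges( int bit_depth )
    {
        int scale = 1 << (bit_depth - 8);
        int full  = (1 << bit_depth) - 1;   /* 2^bit_depth - 1 */
        int black = 16 * scale;
        int white = black + 219 * scale;    /* 219 * 2^(bit_depth-8) steps */
        printf( "%d-bit: full 0..%d, limited luma %d..%d\n",
                bit_depth, full, black, white );
    }
    /* print_ranges(8)  reports full 0..255  vs limited luma 16..235,
     * print_ranges(10) reports full 0..1023 vs limited luma 64..940. */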
Changed |
x264-snapshot-20120126-2245.tar.bz2/input/lavf.c
@@ -46,6 +46,18 @@
av_init_packet( pkt );\
}
+/* handle the deprecated jpeg pixel formats */
+static int handle_jpeg( int csp, int *fullrange )
+{
+ switch( csp )
+ {
+ case PIX_FMT_YUVJ420P: *fullrange = 1; return PIX_FMT_YUV420P;
+ case PIX_FMT_YUVJ422P: *fullrange = 1; return PIX_FMT_YUV422P;
+ case PIX_FMT_YUVJ444P: *fullrange = 1; return PIX_FMT_YUV444P;
+ default: return csp;
+ }
+}
+
static int read_frame_internal( cli_pic_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
{
if( h->first_pic && !info )
@@ -101,14 +113,16 @@
memcpy( p_pic->img.stride, frame.linesize, sizeof(p_pic->img.stride) );
memcpy( p_pic->img.plane, frame.data, sizeof(p_pic->img.plane) );
- p_pic->img.height = c->height;
- p_pic->img.csp = c->pix_fmt | X264_CSP_OTHER;
+ int is_fullrange = 0;
p_pic->img.width = c->width;
+ p_pic->img.height = c->height;
+ p_pic->img.csp = handle_jpeg( c->pix_fmt, &is_fullrange ) | X264_CSP_OTHER;
if( info )
{
+ info->fullrange = is_fullrange;
info->interlaced = frame.interlaced_frame;
- info->tff = frame.top_field_first;
+ info->tff = frame.top_field_first;
}
if( h->vfr_input )
@@ -186,6 +200,7 @@
info->num_frames = h->lavf->streams[i]->nb_frames;
info->sar_height = c->sample_aspect_ratio.den;
info->sar_width = c->sample_aspect_ratio.num;
+ info->fullrange |= c->color_range == AVCOL_RANGE_JPEG;
/* avisynth stores rgb data vertically flipped. */
if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
Changed |
x264-snapshot-20120126-2245.tar.bz2/input/timecode.c
@@ -368,8 +368,6 @@
timecode_input.picture_alloc = h->input.picture_alloc;
timecode_input.picture_clean = h->input.picture_clean;
- *p_handle = h;
-
tcfile_in = fopen( psz_filename, "rb" );
FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename )
else if( !x264_is_regular_file( tcfile_in ) )
@@ -392,6 +390,7 @@
info->timebase_den = h->timebase_den;
info->vfr = 1;
+ *p_handle = h;
return 0;
}
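
The timecode.c change above is small but deliberate: the handle is no longer published to the caller before the timecode file has been opened and parsed. The pattern in isolation, as a minimal sketch (my_state_t and open_something are illustrative names, not x264cli code):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { FILE *f; } my_state_t;             /* illustrative stand-in */

    static int open_something( const char *path, void **p_handle )
    {
        my_state_t *h = calloc( 1, sizeof(*h) );
        if( !h )
            return -1;
        h->f = fopen( path, "rb" );
        if( !h->f )
        {
            free( h );
            return -1;        /* caller's *p_handle is left untouched */
        }
        *p_handle = h;        /* publish only once everything succeeded */
        return 0;
    }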
Changed |
x264-snapshot-20120126-2245.tar.bz2/tools/checkasm-a.asm
@@ -32,8 +32,6 @@
%ifdef WIN64
; just random numbers to reduce the chance of incidental match
ALIGN 16
-n4: dq 0xa77809bf11b239d1
-n5: dq 0x2ba9bf3d2f05b389
x6: ddq 0x79445c159ce790641a1b2550a612b48c
x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943
@@ -44,6 +42,14 @@
x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
+n7: dq 0x21f86d66c8ca00ce
+n8: dq 0x75b6ba21077c48ad
+n9: dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
%endif
SECTION .text
@@ -52,7 +58,7 @@
; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
-%define max_args 11
+%define max_args 15
%ifdef WIN64
@@ -60,9 +66,8 @@
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal checkasm_call, 4,7,16
- sub rsp, max_args*8
- %assign stack_offset stack_offset+max_args*8
+cglobal checkasm_call, 4,15,16
+ SUB rsp, max_args*8
mov r6, r0
mov [rsp+stack_offset+16], r1
mov r0, r2
@@ -77,25 +82,30 @@
%endrep
%assign i 6
%rep 16-6
- movdqa xmm %+ i, [x %+ i]
+ mova m %+ i, [x %+ i]
+ %assign i i+1
+%endrep
+%assign i 7
+%rep 15-7
+ mov r %+ i, [n %+ i]
%assign i i+1
%endrep
- mov r4, [n4]
- mov r5, [n5]
call r6
- xor r4, [n4]
- xor r5, [n5]
- or r4, r5
- pxor xmm5, xmm5
+%assign i 7
+%rep 15-7
+ xor r %+ i, [n %+ i]
+ or r7, r %+ i
+ %assign i i+1
+%endrep
%assign i 6
%rep 16-6
- pxor xmm %+ i, [x %+ i]
- por xmm5, xmm %+ i
+ pxor m %+ i, [x %+ i]
+ por m6, m %+ i
%assign i i+1
%endrep
- packsswb xmm5, xmm5
- movq r5, xmm5
- or r4, r5
+ packsswb m6, m6
+ movq r5, m6
+ or r7, r5
jz .ok
mov r4, rax
lea r0, [error_message]
@@ -104,8 +114,7 @@
mov dword [r1], 0
mov rax, r4
.ok:
- add rsp, max_args*8
- %assign stack_offset stack_offset-max_args*8
+ ADD rsp, max_args*8
RET
%elifndef ARCH_X86_64
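
The checkasm-a.asm hunks widen the Win64 clobber check from r4/r5 to r7-r14 plus xmm6-xmm15: known constants are loaded before the call and compared against what the registers hold afterwards. The comparison arithmetic, sketched in C purely as an illustration (the register loading and read-back can only be done in the asm itself; regs_clobbered is a made-up name):

    #include <stdint.h>

    /* 'before' holds the constants loaded into the registers, 'after' the
     * values read back once the tested function returned. */
    static int regs_clobbered( const uint64_t *before, const uint64_t *after, int n )
    {
        uint64_t acc = 0;
        for( int i = 0; i < n; i++ )
            acc |= before[i] ^ after[i];   /* any nonzero bit means a mismatch */
        return acc != 0;
    }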
Changed |
x264-snapshot-20120126-2245.tar.bz2/tools/checkasm.c
@@ -55,7 +55,7 @@
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 10 // number of different combinations of cpu flags
+#define MAX_CPUS 30 // number of different combinations of cpu flags
typedef struct
{
@@ -168,11 +168,10 @@
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
- b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
- b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
+ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
@@ -180,6 +179,7 @@
b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+ b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
@@ -426,6 +426,10 @@
}
report( "pixel hadamard_ac :" );
+ // maximize sum
+ for( int i = 0; i < 32; i++ )
+ for( int j = 0; j < 16; j++ )
+ pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX;
ok = 1; used_asm = 0;
if( pixel_asm.vsad != pixel_ref.vsad )
{
@@ -434,13 +438,17 @@
int res_c, res_asm;
set_func_name( "vsad" );
used_asm = 1;
- res_c = call_c( pixel_c.vsad, pbuf1, 16, h );
- res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h );
- if( res_c != res_asm )
+ for( int j = 0; j < 2 && ok; j++ )
{
- ok = 0;
- fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
- break;
+ pixel *p = j ? pbuf4 : pbuf1;
+ res_c = call_c( pixel_c.vsad, p, 16, h );
+ res_asm = call_a( pixel_asm.vsad, p, 16, h );
+ if( res_c != res_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
+ break;
+ }
}
}
}
@@ -516,8 +524,10 @@
set_func_name( #name ); \
used_asm = 1; \
ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
- ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ) = {0}; \
- ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ) = {0}; \
+ ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
+ ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
+ memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
+ memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
for( int i=0; i<17; i++ ) \
bitcosts[i] = 9*(i!=8); \
for( int i=0; i<32; i++ ) \
@@ -649,7 +659,8 @@
{
ALIGNED_16( uint16_t sums[72] );
ALIGNED_16( int dc[4] );
- int16_t mvs_a[32], mvs_c[32];
+ ALIGNED_16( int16_t mvs_a[32] );
+ ALIGNED_16( int16_t mvs_c[32] );
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
set_func_name( "esa_ads" );
@@ -718,8 +729,8 @@
{
int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
int cond_b = (i == 0) ? 1 : !cond_a;
- enc[0] = enc[1] = cond_a ? PIXEL_MAX : 0;
- enc[2] = enc[3] = cond_b ? PIXEL_MAX : 0;
+ enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0;
+ enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0;
for( int k = 0; k < 4; k++ )
dec[k] = PIXEL_MAX - enc[k];
@@ -744,6 +755,12 @@
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
+ for( int k = 0; k < size; k++ )\
+ printf( "%d ", ((dctcoef*)t1)[k] );\
+ printf("\n");\
+ for( int k = 0; k < size; k++ )\
+ printf( "%d ", ((dctcoef*)t2)[k] );\
+ printf("\n");\
break; \
} \
call_c( dct_c.name, t1, enc, dec ); \
@@ -1554,11 +1571,15 @@
TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_luma_intra[0], 0 );
TEST_DEBLOCK( deblock_luma_intra[1], 1 );
TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
+ TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
+ TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
if( db_a.deblock_strength != db_ref.deblock_strength )
@@ -1998,6 +2019,7 @@
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+ runlevel_c.mask != runlevel_a.mask || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \
@@ -2293,6 +2315,9 @@
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
+#if BROKEN_STACK_ALIGNMENT
+ *cpu_new |= X264_CPU_STACK_MOD4;
+#endif
if( *cpu_new & X264_CPU_SSE2_IS_FAST )
*cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
if( !quiet )
@@ -2327,6 +2352,7 @@
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
@@ -2336,23 +2362,24 @@
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ }
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
@@ -2361,10 +2388,7 @@
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
- {
- cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
- }
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
if( x264_cpu_detect() & X264_CPU_XOP )
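
About the pbuf4 fill added in the hadamard_ac/vsad hunk earlier in this file: it builds the worst-case input for the intermediate sums, which is what exposes 16-bit overflow bugs in the asm versions. A self-contained check of what the expression produces (pixel_max here stands in for PIXEL_MAX, assumed to be 255 as in the 8-bit build; high-bit-depth builds use 1023):

    #include <assert.h>

    int main( void )
    {
        int pixel_max = 255;
        /* -((i+j)&1) is either 0 or all-one bits, so masking with pixel_max
         * yields a 0 / pixel_max checkerboard. */
        for( int i = 0; i < 32; i++ )
            for( int j = 0; j < 16; j++ )
                assert( (-((i+j)&1) & pixel_max) == (((i+j) & 1) ? pixel_max : 0) );
        return 0;
    }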
Changed |
x264-snapshot-20120126-2245.tar.bz2/version.sh
@@ -1,4 +1,5 @@
#!/bin/bash
+[ -n "$1" ] && cd $1
git rev-list HEAD | sort > config.git-hash
LOCALVER=`wc -l config.git-hash | awk '{print $1}'`
if [ $LOCALVER \> 1 ] ; then
Changed |
x264-snapshot-20120126-2245.tar.bz2/x264.c
@@ -53,6 +53,7 @@
#endif
#if HAVE_SWSCALE
+#undef DECLARE_ALIGNED
#include <libswscale/swscale.h>
#endif
@@ -135,6 +136,8 @@
0
};
+static const char * const range_names[] = { "auto", "tv", "pc", 0 };
+
typedef struct
{
int mod;
@@ -347,19 +350,22 @@
size_t line_len = strlen( INDENT );
for( enum PixelFormat i = PIX_FMT_NONE+1; i < PIX_FMT_NB; i++ )
{
- const char *pfname = av_pix_fmt_descriptors[i].name;
- size_t name_len = strlen( pfname );
- if( line_len + name_len > (80 - strlen( ", " )) )
- {
- printf( "\n" INDENT );
- line_len = strlen( INDENT );
- }
- printf( "%s", pfname );
- line_len += name_len;
- if( i+1 < PIX_FMT_NB )
+ const char *pfname = av_get_pix_fmt_name( i );
+ if( pfname )
{
- printf( ", " );
- line_len += 2;
+ size_t name_len = strlen( pfname );
+ if( line_len + name_len > (80 - strlen( ", " )) )
+ {
+ printf( "\n" INDENT );
+ line_len = strlen( INDENT );
+ }
+ printf( "%s", pfname );
+ line_len += name_len;
+ if( i+1 < PIX_FMT_NB )
+ {
+ printf( ", " );
+ line_len += 2;
+ }
}
}
#endif
@@ -734,9 +740,8 @@
H2( " --videoformat <string> Specify video format [\"%s\"]\n"
" - component, pal, ntsc, secam, mac, undef\n",
strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) );
- H2( " --fullrange <string> Specify full range samples setting [\"%s\"]\n"
- " - off, on\n",
- strtable_lookup( x264_fullrange_names, defaults->vui.b_fullrange ) );
+ H2( " --range <string> Specify color range [\"%s\"]\n"
+ " - %s\n", range_names[0], stringify_names( buf, range_names ) );
H2( " --colorprim <string> Specify color primaries [\"%s\"]\n"
" - undef, bt709, bt470m, bt470bg\n"
" smpte170m, smpte240m, film\n",
@@ -772,6 +777,8 @@
H1( " --output-csp <string> Specify output colorspace [\"%s\"]\n"
" - %s\n", output_csp_names[0], stringify_names( buf, output_csp_names ) );
H1( " --input-depth <integer> Specify input bit depth for raw input\n" );
+ H1( " --input-range <string> Specify input color range [\"%s\"]\n"
+ " - %s\n", range_names[0], stringify_names( buf, range_names ) );
H1( " --input-res <intxint> Specify input resolution (width x height)\n" );
H1( " --index <string> Filename for input index file\n" );
H0( " --sar width:height Specify Sample Aspect Ratio\n" );
@@ -853,7 +860,9 @@
OPT_INPUT_CSP,
OPT_INPUT_DEPTH,
OPT_DTS_COMPRESSION,
- OPT_OUTPUT_CSP
+ OPT_OUTPUT_CSP,
+ OPT_INPUT_RANGE,
+ OPT_RANGE
} OptionsOPT;
static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
@@ -990,7 +999,7 @@
{ "cqm8p", required_argument, NULL, 0 },
{ "overscan", required_argument, NULL, 0 },
{ "videoformat", required_argument, NULL, 0 },
- { "fullrange", required_argument, NULL, 0 },
+ { "range", required_argument, NULL, OPT_RANGE },
{ "colorprim", required_argument, NULL, 0 },
{ "transfer", required_argument, NULL, 0 },
{ "colormatrix", required_argument, NULL, 0 },
@@ -1013,6 +1022,7 @@
{ "input-depth", required_argument, NULL, OPT_INPUT_DEPTH },
{ "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION },
{ "output-csp", required_argument, NULL, OPT_OUTPUT_CSP },
+ { "input-range", required_argument, NULL, OPT_INPUT_RANGE },
{0, 0, 0, 0}
};
@@ -1176,6 +1186,9 @@
else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
param->i_csp = X264_CSP_RGB;
param->i_csp |= info->csp & X264_CSP_HIGH_DEPTH;
+ /* if the output range is not forced, assign it to the input one now */
+ if( param->vui.b_fullrange == RANGE_AUTO )
+ param->vui.b_fullrange = info->fullrange;
if( x264_init_vid_filter( "resize", handle, &filter, info, param, NULL ) )
return -1;
@@ -1237,6 +1250,7 @@
memset( &input_opt, 0, sizeof(cli_input_opt_t) );
memset( &output_opt, 0, sizeof(cli_output_opt_t) );
input_opt.bit_depth = 8;
+ input_opt.input_range = input_opt.output_range = param->vui.b_fullrange = RANGE_AUTO;
int output_csp = defaults.i_csp;
opt->b_progress = 1;
@@ -1402,6 +1416,14 @@
#endif
param->i_csp = output_csp = output_csp_fix[output_csp];
break;
+ case OPT_INPUT_RANGE:
+ FAIL_IF_ERROR( parse_enum_value( optarg, range_names, &input_opt.input_range ), "Unknown input range `%s'\n", optarg )
+ input_opt.input_range += RANGE_AUTO;
+ break;
+ case OPT_RANGE:
+ FAIL_IF_ERROR( parse_enum_value( optarg, range_names, &param->vui.b_fullrange ), "Unknown range `%s'\n", optarg );
+ input_opt.output_range = param->vui.b_fullrange += RANGE_AUTO;
+ break;
default:
generic_option:
{
@@ -1452,10 +1474,11 @@
video_info_t info = {0};
char demuxername[5];
- /* set info flags to param flags to be overwritten by demuxer as necessary. */
+ /* set info flags to be overwritten by demuxer as necessary. */
info.csp = param->i_csp;
info.fps_num = param->i_fps_num;
info.fps_den = param->i_fps_den;
+ info.fullrange = input_opt.input_range == RANGE_PC;
info.interlaced = param->b_interlaced;
info.sar_width = param->vui.i_sar_width;
info.sar_height = param->vui.i_sar_height;
@@ -1540,6 +1563,8 @@
info.interlaced = param->b_interlaced;
info.tff = param->b_tff;
}
+ if( input_opt.input_range != RANGE_AUTO )
+ info.fullrange = input_opt.input_range;
if( init_vid_filters( vid_filters, &opt->hin, &info, param, output_csp ) )
return -1;
@@ -1571,6 +1596,15 @@
x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, but not compiled with interlaced support\n" );
#endif
}
+ /* if the user never specified the output range and the input is now rgb, default it to pc */
+ int csp = param->i_csp & X264_CSP_MASK;
+ if( csp >= X264_CSP_BGR && csp <= X264_CSP_RGB )
+ {
+ if( input_opt.output_range == RANGE_AUTO )
+ param->vui.b_fullrange = RANGE_PC;
+ /* otherwise fail if they specified tv */
+ FAIL_IF_ERROR( !param->vui.b_fullrange, "RGB must be PC range" )
+ }
/* Automatically reduce reference frame count to match the user's target level
* if the user didn't explicitly set a reference frame count. */
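
How the "+= RANGE_AUTO" lines above map option strings onto the new enum: parse_enum_value() stores the index of the matched name (0 for "auto", 1 for "tv", 2 for "pc"), and adding RANGE_AUTO (-1, see the x264cli.h hunk below) shifts that to RANGE_AUTO/RANGE_TV/RANGE_PC, where RANGE_TV and RANGE_PC double as the off/on values of b_fullrange once the auto case has been resolved. A tiny self-contained check of that arithmetic:

    /* same values as the x264cli.h hunk below */
    typedef enum { RANGE_AUTO = -1, RANGE_TV, RANGE_PC } range_enum;

    int main( void )
    {
        int idx   = 2;                 /* what parse_enum_value() stores for "pc" */
        int range = idx + RANGE_AUTO;  /* == RANGE_PC == 1, i.e. b_fullrange on  */
        return range == RANGE_PC ? 0 : 1;
    }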
Changed |
x264-snapshot-20120126-2245.tar.bz2/x264.h
@@ -41,7 +41,7 @@
#include "x264_config.h"
-#define X264_BUILD 119
+#define X264_BUILD 120
/* x264_t:
* opaque handler for encoder */
@@ -724,8 +724,7 @@
x264_hrd_t hrd_timing;
/* In: arbitrary user SEI (e.g subtitles, AFDs) */
x264_sei_t extra_sei;
- /* private user data. libx264 doesn't touch this,
- not even copy it from input to output frames. */
+ /* private user data. copied from input to output frames. */
void *opaque;
} x264_picture_t;
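
The one-line comment change in x264.h reflects a behaviour change in build 120: the opaque pointer now travels from each input picture to the output picture it produced. A minimal usage sketch (encoder setup and error handling omitted; do_something() and user_data are illustrative, the rest is the regular x264 encoding API):

    #include <x264.h>

    extern void do_something( void *user_data, x264_nal_t *nal, int size );  /* illustrative */

    static int encode_one( x264_t *encoder, x264_picture_t *pic_in, void *user_data )
    {
        x264_picture_t pic_out;
        x264_nal_t *nal;
        int i_nal;
        pic_in->opaque = user_data;   /* travels with this particular frame */
        int frame_size = x264_encoder_encode( encoder, &nal, &i_nal, pic_in, &pic_out );
        if( frame_size > 0 )
            /* pic_out.opaque is the user_data of the frame these NAL units
             * actually encode, despite frame reordering */
            do_something( pic_out.opaque, nal, frame_size );
        return frame_size;
    }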
Changed |
x264-snapshot-20120126-2245.tar.bz2/x264cli.h
@@ -72,4 +72,11 @@
#define FAIL_IF_ERR( cond, name, ... ) RETURN_IF_ERR( cond, name, -1, __VA_ARGS__ )
+typedef enum
+{
+ RANGE_AUTO = -1,
+ RANGE_TV,
+ RANGE_PC
+} range_enum;
+
#endif