[-]
[+]
|
Changed |
x264.spec
|
|
[-]
[+]
|
Deleted |
x264-rewrite-ffmpeg-defaults.patch
^
|
@@ -1,18 +0,0 @@
-diff -ur x264-snapshot-20110225-2245.orig/encoder/encoder.c x264-snapshot-20110225-2245/encoder/encoder.c
---- x264-snapshot-20110225-2245.orig/encoder/encoder.c 2011-02-25 22:45:04.000000000 +0100
-+++ x264-snapshot-20110225-2245/encoder/encoder.c 2011-02-26 14:24:02.144281162 +0100
-@@ -487,12 +487,8 @@
- score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8;
- if( score >= 5 )
- {
-- x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" );
-- x264_log( h, X264_LOG_ERROR, "use an encoding preset (e.g. -vpre medium)\n" );
-- x264_log( h, X264_LOG_ERROR, "preset usage: -vpre <speed> -vpre <profile>\n" );
-- x264_log( h, X264_LOG_ERROR, "speed presets are listed in x264 --help\n" );
-- x264_log( h, X264_LOG_ERROR, "profile is optional; x264 defaults to high\n" );
-- return -1;
-+ /* broken ffmpeg defaults, set to h264 defaults */
-+ x264_param_default( &h->param );
- }
- }
-
|
[-]
[+]
|
Changed |
x264-use-shared-library.patch
^
|
@@ -1,31 +1,15 @@
-diff -ur x264-snapshot-20110225-2245.orig/Makefile x264-snapshot-20110225-2245/Makefile
---- x264-snapshot-20110225-2245.orig/Makefile 2011-02-25 22:45:04.000000000 +0100
-+++ x264-snapshot-20110225-2245/Makefile 2011-02-26 14:25:51.568295374 +0100
-@@ -145,9 +145,10 @@
+--- Makefile.orig 2011-05-27 22:45:04.000000000 +0200
++++ Makefile 2011-05-28 15:18:29.883305471 +0200
+@@ -149,9 +149,10 @@
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
- $(CC) -shared -o $@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ ln -s $(SONAME) libx264.so
--x264$(EXE): $(OBJCLI) libx264.a
-- $(CC) -o $@ $+ $(LDFLAGSCLI) $(LDFLAGS)
-+x264$(EXE): $(OBJCLI) $(SONAME)
-+ $(CC) -o $@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
+-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+- $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
++x264$(EXE): .depend $(OBJCLI) $(SONAME)
++ $(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
- checkasm: tools/checkasm.o libx264.a
- $(CC) -o $@ $+ $(LDFLAGS)
-@@ -219,10 +220,12 @@
- install -d $(DESTDIR)$(libdir)/pkgconfig
- install -m 644 x264.h $(DESTDIR)$(includedir)
- install -m 644 x264_config.h $(DESTDIR)$(includedir)
-- install -m 644 libx264.a $(DESTDIR)$(libdir)
- install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
- install x264$(EXE) $(DESTDIR)$(bindir)
-- $(RANLIB) $(DESTDIR)$(libdir)/libx264.a
-+ if [ -e libx264.a ]; then \
-+ install -m 644 libx264.a $(DESTDIR)$(libdir); \
-+ $(RANLIB) $(DESTDIR)$(libdir)/libx264.a; \
-+ fi
- ifeq ($(SYS),MINGW)
- $(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
- else
+ checkasm: tools/checkasm.o $(LIBX264)
+ $(LD)$@ $+ $(LDFLAGS)
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/.gitignore
^
|
@@ -5,6 +5,9 @@
*.rej
*.dll*
*.exe
+*.def
+*.lib
+*.pdb
*.mo
*.o
*.patch
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/Makefile
^
|
@@ -125,7 +125,7 @@
endif
ifneq ($(SONAME),)
-ifeq ($(SYS),MINGW)
+ifeq ($(SYS),WINDOWS)
SRCSO += x264dll.c
endif
endif
@@ -135,34 +135,38 @@
OBJSO = $(SRCSO:%.c=%.o)
DEP = depend
-.PHONY: all default fprofiled clean distclean install uninstall dox test testclean
+.PHONY: all default fprofiled clean distclean install uninstall dox test testclean lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
-default: $(DEP) x264$(EXE)
+default: $(DEP)
-libx264.a: .depend $(OBJS) $(OBJASM)
- $(AR) rc libx264.a $(OBJS) $(OBJASM)
- $(RANLIB) libx264.a
+cli: x264$(EXE)
+lib-static: $(LIBX264)
+lib-shared: $(SONAME)
+
+$(LIBX264): .depend $(OBJS) $(OBJASM)
+ $(AR)$@ $(OBJS) $(OBJASM)
+ $(if $(RANLIB), $(RANLIB) $@)
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
- $(CC) -shared -o $@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
-x264$(EXE): $(OBJCLI) libx264.a
- $(CC) -o $@ $+ $(LDFLAGSCLI) $(LDFLAGS)
+x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+ $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-checkasm: tools/checkasm.o libx264.a
- $(CC) -o $@ $+ $(LDFLAGS)
+checkasm: tools/checkasm.o $(LIBX264)
+ $(LD)$@ $+ $(LDFLAGS)
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
- -@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
+ -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
%.o: %.S
$(AS) $(ASFLAGS) -o $@ $<
- -@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
+ -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
.depend: config.mak
@rm -f .depend
- @$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) -MT $(SRC:%.c=%.o) -MM -g0 1>> .depend;)
+ @$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:%.c=%.o) $(DEPMM) 1>> .depend;)
config.mak:
./configure
@@ -191,39 +195,40 @@
else
fprofiled:
$(MAKE) clean
- mv config.mak config.mak2
- sed -e 's/CFLAGS.*/& -fprofile-generate/; s/LDFLAGS.*/& -fprofile-generate/' config.mak2 > config.mak
- $(MAKE) x264$(EXE)
+ $(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
rm -f $(SRC2:%.c=%.o)
- sed -e 's/CFLAGS.*/& -fprofile-use/; s/LDFLAGS.*/& -fprofile-use/' config.mak2 > config.mak
- $(MAKE)
- rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- mv config.mak2 config.mak
+ $(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
+ rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
endif
clean:
- rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a x264 x264.exe .depend TAGS
+ rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
- rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- - sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
+ rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
distclean: clean
- rm -f config.mak x264_config.h config.h config.log x264.pc
+ rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
rm -rf test/
-install: x264$(EXE) $(SONAME)
+install-cli: cli
install -d $(DESTDIR)$(bindir)
+ install x264$(EXE) $(DESTDIR)$(bindir)
+
+install-lib-dev:
install -d $(DESTDIR)$(includedir)
install -d $(DESTDIR)$(libdir)
install -d $(DESTDIR)$(libdir)/pkgconfig
install -m 644 x264.h $(DESTDIR)$(includedir)
install -m 644 x264_config.h $(DESTDIR)$(includedir)
- install -m 644 libx264.a $(DESTDIR)$(libdir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
- install x264$(EXE) $(DESTDIR)$(bindir)
- $(RANLIB) $(DESTDIR)$(libdir)/libx264.a
-ifeq ($(SYS),MINGW)
+
+install-lib-static: lib-static install-lib-dev
+ install -m 644 $(LIBX264) $(DESTDIR)$(libdir)
+ $(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))
+
+install-lib-shared: lib-shared install-lib-dev
+ifeq ($(SYS),WINDOWS)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
$(if $(SONAME), ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX))
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/arm/mc-a.S
^
|
@@ -106,17 +106,21 @@
vst1.64 {d0-d1}, [r3,:r3align]!
32: // n is a multiple of 32
tst r2, #32
- beq 64f
+ beq 640f
sub r2, #32
vld1.64 {d0-d3}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
-64: // n is a multiple of 64
+640: // n is a multiple of 64
+ cmp r2, #0
+ beq 1f
+64:
subs r2, #64
vld1.64 {d0-d3}, [r1,:r1align]!
vld1.64 {d4-d7}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
vst1.64 {d4-d7}, [r3,:r3align]!
bgt 64b
+1: // end
.if \srcalign == 8 && \dstalign == 8
vld1.64 {d0}, [r1,:64]!
vst1.64 {d0}, [r3,:64]!
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/cabac.c
^
|
@@ -708,11 +708,12 @@
{118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
};
-const uint8_t x264_cabac_renorm_shift[64]= {
- 6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+const uint8_t x264_cabac_renorm_shift[64] =
+{
+ 6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
/* -ln2(probability) */
@@ -752,24 +753,29 @@
FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
};
+uint8_t x264_cabac_contexts[4][QP_MAX_SPEC+1][460];
+
+void x264_cabac_init( void )
+{
+ for( int i = 0; i < 4; i++ )
+ {
+ const int8_t (*cabac_context_init)[460][2] = i == 0 ? &x264_cabac_context_init_I
+ : &x264_cabac_context_init_PB[i-1];
+ for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
+ for( int j = 0; j < 460; j++ )
+ {
+ int state = x264_clip3( (((*cabac_context_init)[j][0] * qp) >> 4) + (*cabac_context_init)[j][1], 1, 126 );
+ x264_cabac_contexts[i][qp][j] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
+ }
+ }
+}
/*****************************************************************************
*
*****************************************************************************/
void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
{
- const int8_t (*cabac_context_init)[460][2];
-
- if( i_slice_type == SLICE_TYPE_I )
- cabac_context_init = &x264_cabac_context_init_I;
- else
- cabac_context_init = &x264_cabac_context_init_PB[i_model];
-
- for( int i = 0; i < 460; i++ )
- {
- int state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
- cb->state[i] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
- }
+ memcpy( cb->state, x264_cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], 460 );
}
void x264_cabac_encode_init_core( x264_cabac_t *cb )
@@ -846,10 +852,11 @@
x264_cabac_encode_renorm( cb );
}
+/* Note: b is negated for this function */
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
{
cb->i_low <<= 1;
- cb->i_low += -b & cb->i_range;
+ cb->i_low += b & cb->i_range;
cb->i_queue += 1;
x264_cabac_putbyte( cb );
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/common.c
^
|
@@ -443,6 +443,7 @@
param->analyse.b_transform_8x8 = 0;
param->b_cabac = 0;
param->i_cqm_preset = X264_CQM_FLAT;
+ param->psz_cqm_file = NULL;
param->i_bframe = 0;
param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
if( param->b_interlaced )
@@ -460,6 +461,7 @@
{
param->analyse.b_transform_8x8 = 0;
param->i_cqm_preset = X264_CQM_FLAT;
+ param->psz_cqm_file = NULL;
}
else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
{
@@ -621,6 +623,8 @@
else
p->i_level_idc = atoi(value);
}
+ OPT("bluray-compat")
+ p->b_bluray_compat = atobool(value);
OPT("sar")
{
b_error = ( 2 != sscanf( value, "%d:%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) &&
@@ -705,14 +709,7 @@
}
}
OPT("open-gop")
- {
- b_error |= parse_enum( value, x264_open_gop_names, &p->i_open_gop );
- if( b_error )
- {
- b_error = 0;
- p->i_open_gop = atoi(value);
- }
- }
+ p->b_open_gop = atobool(value);
OPT("nf")
p->b_deblocking_filter = !atobool(value);
OPT2("filter", "deblock")
@@ -1095,7 +1092,7 @@
void *x264_malloc( int i_size )
{
uint8_t *align_buf = NULL;
-#if SYS_MACOSX || (SYS_MINGW && ARCH_X86_64)
+#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
/* Mac OS X and Win x64 always returns 16 byte aligned memory */
align_buf = malloc( i_size );
#elif HAVE_MALLOC_H
@@ -1121,7 +1118,7 @@
{
if( p )
{
-#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_MINGW && ARCH_X86_64)
+#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
free( p );
#else
free( *( ( ( void **) p ) - 1 ) );
@@ -1160,7 +1157,7 @@
char *x264_slurp_file( const char *filename )
{
int b_error = 0;
- int i_size;
+ size_t i_size;
char *buf;
FILE *fh = fopen( filename, "rb" );
if( !fh )
@@ -1240,6 +1237,7 @@
s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" );
+ s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat );
s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
@@ -1248,7 +1246,7 @@
{
s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d open_gop=%d",
p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
- p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred, p->i_open_gop );
+ p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred, p->b_open_gop );
}
s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/common.h
^
|
@@ -92,6 +92,16 @@
#include <assert.h>
#include <limits.h>
+#if HAVE_INTERLACED
+# define MB_INTERLACED h->mb.b_interlaced
+# define SLICE_MBAFF h->sh.b_mbaff
+# define PARAM_INTERLACED h->param.b_interlaced
+#else
+# define MB_INTERLACED 0
+# define SLICE_MBAFF 0
+# define PARAM_INTERLACED 0
+#endif
+
/* Unions for type-punning.
* Mn: load or store n bits, aligned, native-endian
* CPn: copy n bits, aligned, native-endian
@@ -137,7 +147,7 @@
#define X264_SCAN8_LUMA_SIZE (5*8)
#define X264_SCAN8_0 (4+1*8)
-static const int x264_scan8[16+2*4+3] =
+static const unsigned x264_scan8[16+2*4+3] =
{
/* Luma */
4+1*8, 5+1*8, 4+2*8, 5+2*8,
@@ -205,7 +215,8 @@
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_init_vlc_tables( void );
+void x264_cavlc_init( void );
+void x264_cabac_init( void );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
@@ -310,6 +321,7 @@
SEI_USER_DATA_REGISTERED = 4,
SEI_USER_DATA_UNREGISTERED = 5,
SEI_RECOVERY_POINT = 6,
+ SEI_DEC_REF_PIC_MARKING = 7,
SEI_FRAME_PACKING = 45,
};
@@ -392,6 +404,15 @@
typedef struct x264_ratecontrol_t x264_ratecontrol_t;
+typedef struct x264_left_table_t
+{
+ uint8_t intra[4];
+ uint8_t nnz[4];
+ uint8_t nnz_chroma[4];
+ uint8_t mv[4];
+ uint8_t ref[4];
+} x264_left_table_t;
+
struct x264_t
{
/* encoder parameters */
@@ -473,6 +494,10 @@
/* Slice header */
x264_slice_header_t sh;
+ /* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
+ int b_sh_backup;
+ x264_slice_header_t sh_backup;
+
/* cabac context */
x264_cabac_t cabac;
@@ -549,6 +574,8 @@
int i_mb_stride;
int i_b8_stride;
int i_b4_stride;
+ int left_b8[2];
+ int left_b4[2];
/* Current index */
int i_mb_x;
@@ -568,17 +595,24 @@
int i_psy_trellis; /* Psy trellis strength--fixed point value*/
int b_interlaced;
+ int b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */
/* Allowed qpel MV range to stay within the picture + emulated edge pixels */
int mv_min[2];
int mv_max[2];
+ int mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
+ int mv_maxy_row[3];
/* Subpel MV range for motion search.
* same mv_min/max but includes levels' i_mv_range. */
int mv_min_spel[2];
int mv_max_spel[2];
+ int mv_miny_spel_row[3];
+ int mv_maxy_spel_row[3];
/* Fullpel MV range for motion search */
int mv_min_fpel[2];
int mv_max_fpel[2];
+ int mv_miny_fpel_row[3];
+ int mv_maxy_fpel_row[3];
/* neighboring MBs */
unsigned int i_neighbour;
@@ -587,14 +621,22 @@
unsigned int i_neighbour_intra; /* for constrained intra pred */
unsigned int i_neighbour_frame; /* ignoring slice boundaries */
int i_mb_type_top;
- int i_mb_type_left;
+ int i_mb_type_left[2];
int i_mb_type_topleft;
int i_mb_type_topright;
int i_mb_prev_xy;
- int i_mb_left_xy;
+ int i_mb_left_xy[2];
int i_mb_top_xy;
int i_mb_topleft_xy;
int i_mb_topright_xy;
+ int i_mb_top_y;
+ int i_mb_topleft_y;
+ int i_mb_topright_y;
+ const x264_left_table_t *left_index_table;
+ int i_mb_top_mbpair_xy;
+ int topleft_partition;
+ int b_allow_skip;
+ int field_decoding_flag;
/**** thread synchronization ends here ****/
/* subsequent variables are either thread-local or constant,
@@ -617,6 +659,7 @@
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
* NOTE: this will fail on resolutions above 2^16 MBs... */
+ uint8_t *field;
/* buffer for weighted versions of the reference frames */
pixel *p_weight_buf[X264_REF_MAX];
@@ -645,6 +688,7 @@
int b_reencode_mb;
int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
int b_deblock_rdo;
+ int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */
struct
{
@@ -716,11 +760,15 @@
/* number of neighbors (top and left) that used 8x8 dct */
int i_neighbour_transform_size;
- int i_neighbour_interlaced;
+ int i_neighbour_skip;
/* neighbor CBPs */
int i_cbp_top;
int i_cbp_left;
+
+ /* extra data required for mbaff in mv prediction */
+ int16_t topright_mv[2][3][2];
+ int8_t topright_ref[2][3];
} cache;
/* */
@@ -739,9 +787,9 @@
int i_chroma_lambda2_offset;
/* B_direct and weighted prediction */
- int16_t dist_scale_factor_buf[2][X264_REF_MAX*2][4];
+ int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
int16_t (*dist_scale_factor)[4];
- int8_t bipred_weight_buf[2][X264_REF_MAX*2][4];
+ int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
int8_t (*bipred_weight)[4];
/* maps fref1[0]'s ref indices into the current list0 */
#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
@@ -776,6 +824,7 @@
int i_mb_partition[17];
int i_mb_cbp[6];
int i_mb_pred_mode[4][13];
+ int i_mb_field[3];
/* Adaptive direct mv pred */
int i_direct_score[2];
/* Metrics */
@@ -805,6 +854,7 @@
int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
int64_t i_mb_cbp[6];
int64_t i_mb_pred_mode[4][13];
+ int64_t i_mb_field[3];
/* */
int i_direct_score[2];
int i_direct_frames[2];
@@ -824,8 +874,10 @@
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
- pixel *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
- uint8_t (*deblock_strength[2])[2][4][4];
+ pixel *intra_border_backup[5][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+ /* Deblock strength values are stored for each 4x4 partition. In MBAFF
+ * there are four extra values that need to be stored, located in [4][i]. */
+ uint8_t (*deblock_strength[2])[2][8][4];
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
@@ -838,6 +890,8 @@
x264_mc_functions_t mc;
x264_dct_function_t dctf;
x264_zigzag_function_t zigzagf;
+ x264_zigzag_function_t zigzagf_interlaced;
+ x264_zigzag_function_t zigzagf_progressive;
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
x264_bitstream_function_t bsf;
@@ -850,11 +904,12 @@
// included at the end because it needs x264_t
#include "macroblock.h"
-#include "rectangle.h"
-#if HAVE_MMX
+#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif
+#include "rectangle.h"
+
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/cpu.c
^
|
@@ -45,7 +45,8 @@
#include <machine/cpu.h>
#endif
-const x264_cpu_name_t x264_cpu_names[] = {
+const x264_cpu_name_t x264_cpu_names[] =
+{
{"Altivec", X264_CPU_ALTIVEC},
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
{"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT},
@@ -357,9 +358,12 @@
#if !HAVE_THREAD
return 1;
-#elif defined(_WIN32)
+#elif SYS_WINDOWS
return x264_pthread_num_processors_np();
+#elif SYS_CYGWIN
+ return sysconf( _SC_NPROCESSORS_ONLN );
+
#elif SYS_LINUX
unsigned int bit;
int np;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/cpu.h
^
|
@@ -31,7 +31,16 @@
void x264_cpu_emms( void );
void x264_cpu_sfence( void );
#if HAVE_MMX
+/* There is no way to forbid the compiler from using float instructions
+ * before the emms so miscompilation could theoretically occur in the
+ * unlikely event that the compiler reorders emms and float instructions. */
+#if HAVE_X86_INLINE_ASM
+/* Clobbering memory makes the compiler less likely to reorder code. */
+#define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
+ "st(3)","st(4)","st(5)","st(6)","st(7)" )
+#else
#define x264_emms() x264_cpu_emms()
+#endif
#else
#define x264_emms()
#endif
@@ -53,9 +62,10 @@
#define x264_stack_align(func,...) func(__VA_ARGS__)
#endif
-typedef struct {
+typedef struct
+{
const char name[16];
- int flags;
+ uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/dct.c
^
|
@@ -746,123 +746,117 @@
}
}
-void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
+void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
- if( b_interlaced )
- {
- pf->scan_8x8 = zigzag_scan_8x8_field;
- pf->scan_4x4 = zigzag_scan_4x4_field;
- pf->sub_8x8 = zigzag_sub_8x8_field;
- pf->sub_4x4 = zigzag_sub_4x4_field;
- pf->sub_4x4ac = zigzag_sub_4x4ac_field;
+ pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
+ pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
+ pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
+ pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
+ pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
+ pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
+ pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
+ pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
+ pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
+ pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
+
#if HIGH_BIT_DEPTH
#if HAVE_MMX
- if( cpu&X264_CPU_SSE2 )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
- if( cpu&X264_CPU_SSE4 )
- pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
- if( cpu&X264_CPU_AVX )
- pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
-#endif // HAVE_MMX
-#else
-#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
- {
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
- pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
- }
- if( cpu&X264_CPU_SSSE3 )
- {
- pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
- pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
- }
- if( cpu&X264_CPU_AVX )
- {
- pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
-#if ARCH_X86_64
- pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx;
-#endif
- }
-#endif // HAVE_MMX
-#if HAVE_ALTIVEC
- if( cpu&X264_CPU_ALTIVEC )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
-#endif
-#endif // HIGH_BIT_DEPTH
- }
- else
+ if( cpu&X264_CPU_SSE2 )
{
- pf->scan_8x8 = zigzag_scan_8x8_frame;
- pf->scan_4x4 = zigzag_scan_4x4_frame;
- pf->sub_8x8 = zigzag_sub_8x8_frame;
- pf->sub_4x4 = zigzag_sub_4x4_frame;
- pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
-#if HIGH_BIT_DEPTH
-#if HAVE_MMX
- if( cpu&X264_CPU_SSE2 )
- {
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
- }
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
+ }
+ if( cpu&X264_CPU_SSE4 )
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
+ if( cpu&X264_CPU_AVX )
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
- if( cpu&X264_CPU_AVX )
- {
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
- }
+ if( cpu&X264_CPU_AVX )
+ {
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
+ }
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
- if( cpu&X264_CPU_MMX )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
- if( cpu&X264_CPU_MMXEXT )
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
- if( cpu&X264_CPU_SSE2_IS_FAST )
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
- if( cpu&X264_CPU_SSSE3 )
- {
- pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
- pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
- }
- if( cpu&X264_CPU_AVX )
- {
- pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
+ if( cpu&X264_CPU_MMX )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
+ if( cpu&X264_CPU_MMXEXT )
+ {
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
+ }
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
+ if( cpu&X264_CPU_SSSE3 )
+ {
+ pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
+ pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
+ pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
+ }
+ if( cpu&X264_CPU_AVX )
+ {
+ pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
+ pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
- pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
+ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
+ pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
- }
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
- if( cpu&X264_CPU_ALTIVEC )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
+ if( cpu&X264_CPU_ALTIVEC )
+ {
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
+ }
#endif
#if HAVE_ARMV6
- if( cpu&X264_CPU_NEON )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+ if( cpu&X264_CPU_NEON )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif
#endif // HIGH_BIT_DEPTH
- }
- pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
if( cpu&X264_CPU_SSE2 )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ }
if( cpu&X264_CPU_AVX )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
+ }
#else
if( cpu&X264_CPU_MMX )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+ }
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ }
+
if( cpu&X264_CPU_AVX )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
+ }
#endif // HIGH_BIT_DEPTH
#endif
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/dct.h
^
|
@@ -132,6 +132,6 @@
void x264_dct_init( int cpu, x264_dct_function_t *dctf );
void x264_dct_init_weights( void );
-void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced );
+void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/deblock.c
^
|
@@ -75,6 +75,37 @@
#define tc0_table(x) i_tc0_table[(x)+24]
/* From ffmpeg */
+static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 )
+{
+ int p2 = pix[-3*xstride];
+ int p1 = pix[-2*xstride];
+ int p0 = pix[-1*xstride];
+ int q0 = pix[ 0*xstride];
+ int q1 = pix[ 1*xstride];
+ int q2 = pix[ 2*xstride];
+
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ int tc = tc0;
+ int delta;
+ if( abs( p2 - p0 ) < beta )
+ {
+ if( tc0 )
+ pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0 );
+ tc++;
+ }
+ if( abs( q2 - q0 ) < beta )
+ {
+ if( tc0 )
+ pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0, tc0 );
+ tc++;
+ }
+
+ delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
+ }
+}
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
@@ -84,40 +115,15 @@
pix += 4*ystride;
continue;
}
- for( int d = 0; d < 4; d++ )
- {
- int p2 = pix[-3*xstride];
- int p1 = pix[-2*xstride];
- int p0 = pix[-1*xstride];
- int q0 = pix[ 0*xstride];
- int q1 = pix[ 1*xstride];
- int q2 = pix[ 2*xstride];
-
- if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- {
- int tc = tc0[i];
- int delta;
- if( abs( p2 - p0 ) < beta )
- {
- if( tc0[i] )
- pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
- tc++;
- }
- if( abs( q2 - q0 ) < beta )
- {
- if( tc0[i] )
- pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
- tc++;
- }
-
- delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
- pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
- }
- pix += ystride;
- }
+ for( int d = 0; d < 4; d++, pix += ystride )
+ deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
+static inline void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ for( int d = 0; d < 8; d++, pix += stride )
+ deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
+}
static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
@@ -127,6 +133,20 @@
deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
+static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc )
+{
+ int p1 = pix[-2*xstride];
+ int p0 = pix[-1*xstride];
+ int q0 = pix[ 0*xstride];
+ int q1 = pix[ 1*xstride];
+
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
+ }
+}
static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
@@ -139,21 +159,14 @@
}
for( int d = 0; d < 2; d++, pix += ystride-2 )
for( int e = 0; e < 2; e++, pix++ )
- {
- int p1 = pix[-2*xstride];
- int p0 = pix[-1*xstride];
- int q0 = pix[ 0*xstride];
- int q1 = pix[ 1*xstride];
-
- if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- {
- int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
- pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
- }
- }
+ deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
+static inline void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ for( int i = 0; i < 4; i++, pix += stride )
+ deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] );
+}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 );
@@ -163,49 +176,55 @@
deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 );
}
-static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
+static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
{
- for( int d = 0; d < 16; d++ )
- {
- int p2 = pix[-3*xstride];
- int p1 = pix[-2*xstride];
- int p0 = pix[-1*xstride];
- int q0 = pix[ 0*xstride];
- int q1 = pix[ 1*xstride];
- int q2 = pix[ 2*xstride];
+ int p2 = pix[-3*xstride];
+ int p1 = pix[-2*xstride];
+ int p0 = pix[-1*xstride];
+ int q0 = pix[ 0*xstride];
+ int q1 = pix[ 1*xstride];
+ int q2 = pix[ 2*xstride];
- if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ if( abs( p0 - q0 ) < ((alpha >> 2) + 2) )
{
- if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
+ if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
{
- if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
- {
- const int p3 = pix[-4*xstride];
- pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
- pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
- pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- }
- else /* p0' */
- pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
- {
- const int q3 = pix[3*xstride];
- pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
- pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
- pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- }
- else /* q0' */
- pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ const int p3 = pix[-4*xstride];
+ pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+ pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+ pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
}
- else /* p0', q0' */
- {
+ else /* p0' */
pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
+ {
+ const int q3 = pix[3*xstride];
+ pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+ pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+ pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
}
+ else /* q0' */
+ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ else /* p0', q0' */
+ {
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
- pix += ystride;
}
}
+static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
+{
+ for( int d = 0; d < 16; d++, pix += ystride )
+ deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
+}
+static inline void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
+{
+ for( int d = 0; d < 8; d++, pix += ystride )
+ deblock_edge_luma_intra_c( pix, 1, alpha, beta );
+}
static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
@@ -215,22 +234,29 @@
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
+static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta )
+{
+ int p1 = pix[-2*xstride];
+ int p0 = pix[-1*xstride];
+ int q0 = pix[ 0*xstride];
+ int q1 = pix[ 1*xstride];
+
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
+ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
+ }
+}
static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir )
{
for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 )
for( int e = 0; e < (dir?1:2); e++, pix++ )
- {
- int p1 = pix[-2*xstride];
- int p0 = pix[-1*xstride];
- int q0 = pix[ 0*xstride];
- int q1 = pix[ 1*xstride];
-
- if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- {
- pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
- pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
- }
- }
+ deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
+}
+static inline void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+{
+ for( int i = 0; i < 4; i++, pix += stride )
+ deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
@@ -242,8 +268,8 @@
}
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
- int bframe )
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
+ int bframe, x264_t *h )
{
for( int dir = 0; dir < 2; dir++ )
{
@@ -270,6 +296,162 @@
}
}
+void deblock_strength_mbaff_c( uint8_t nnz_cache[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe, x264_t *h )
+{
+ int neighbour_field[2];
+ neighbour_field[0] = h->mb.i_mb_left_xy[0] >= 0 && h->mb.field[h->mb.i_mb_left_xy[0]];
+ neighbour_field[1] = h->mb.i_mb_top_xy >= 0 && h->mb.field[h->mb.i_mb_top_xy];
+ int intra_cur = IS_INTRA( h->mb.i_type );
+
+ if( !intra_cur )
+ {
+ for( int dir = 0; dir < 2; dir++ )
+ {
+ int edge_stride = dir ? 8 : 1;
+ int part_stride = dir ? 1 : 8;
+ for( int edge = 0; edge < 4; edge++ )
+ {
+ for( int i = 0, q = X264_SCAN8_0+edge*edge_stride; i < 4; i++, q += part_stride )
+ {
+ int p = q - edge_stride;
+ if( nnz_cache[q] || nnz_cache[p] )
+ {
+ bs[dir][edge][i] = 2;
+ }
+ else if( (edge == 0 && MB_INTERLACED != neighbour_field[dir]) ||
+ ref[0][q] != ref[0][p] ||
+ abs( mv[0][q][0] - mv[0][p][0] ) >= 4 ||
+ abs( mv[0][q][1] - mv[0][p][1] ) >= mvy_limit ||
+ (bframe && (ref[1][q] != ref[1][p] ||
+ abs( mv[1][q][0] - mv[1][p][0] ) >= 4 ||
+ abs( mv[1][q][1] - mv[1][p][1] ) >= mvy_limit )) )
+ {
+ bs[dir][edge][i] = 1;
+ }
+ else
+ bs[dir][edge][i] = 0;
+ }
+ }
+ }
+ }
+
+ if( h->mb.i_neighbour & MB_LEFT )
+ {
+ if( h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
+ {
+ static const uint8_t offset[2][2][8] = {
+ { { 0, 0, 0, 0, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 3, 3, 3, 3 }, },
+ { { 0, 1, 2, 3, 0, 1, 2, 3 },
+ { 0, 1, 2, 3, 0, 1, 2, 3 }, }
+ };
+ uint8_t bS[8];
+
+ if( intra_cur )
+ memset( bS, 4, 8 );
+ else
+ {
+ const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1];
+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
+ for( int i = 0; i < 8; i++ )
+ {
+ int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1];
+ int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)];
+ int nnz_left = nnz[left][3 + 4*off[i]];
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
+ {
+ int j = off[i]&~1;
+ if( h->mb.mb_transform_size[left] )
+ nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] ));
+ }
+ if( IS_INTRA( h->mb.type[left] ) )
+ bS[i] = 4;
+ else if( nnz_left || nnz_this )
+ bS[i] = 2;
+ else // As left is different interlaced.
+ bS[i] = 1;
+ }
+ }
+
+ if( MB_INTERLACED )
+ {
+ for( int i = 0; i < 4; i++ ) bs[0][0][i] = bS[i];
+ for( int i = 0; i < 4; i++ ) bs[0][4][i] = bS[4+i];
+ }
+ else
+ {
+ for( int i = 0; i < 4; i++ ) bs[0][0][i] = bS[2*i];
+ for( int i = 0; i < 4; i++ ) bs[0][4][i] = bS[1+2*i];
+ }
+ }
+ }
+
+ if( h->mb.i_neighbour & MB_TOP )
+ {
+ if( !(h->mb.i_mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
+ {
+ /* Need to filter both fields (even for frame macroblocks).
+ * Filter top two rows using the top macroblock of the above
+ * pair and then the bottom one. */
+ int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride;
+ uint32_t nnz_cur[4];
+ nnz_cur[0] = h->mb.cache.non_zero_count[x264_scan8[0]+0];
+ nnz_cur[1] = h->mb.cache.non_zero_count[x264_scan8[0]+1];
+ nnz_cur[2] = h->mb.cache.non_zero_count[x264_scan8[0]+2];
+ nnz_cur[3] = h->mb.cache.non_zero_count[x264_scan8[0]+3];
+ /* Munge NNZ for cavlc + 8x8dct */
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode &&
+ h->mb.mb_transform_size[h->mb.i_mb_xy] )
+ {
+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
+ nnz_cur[0] = nnz_cur[1] = !!nnz0;
+ nnz_cur[2] = nnz_cur[3] = !!nnz1;
+ }
+
+ for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
+ {
+ int mbn_intra = IS_INTRA( h->mb.type[mbn_xy] );
+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
+ uint32_t nnz_top[4];
+ nnz_top[0] = nnz[mbn_xy][3*4+0];
+ nnz_top[1] = nnz[mbn_xy][3*4+1];
+ nnz_top[2] = nnz[mbn_xy][3*4+2];
+ nnz_top[3] = nnz[mbn_xy][3*4+3];
+
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode &&
+ (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[mbn_xy] )
+ {
+ int nnz_top0 = M16( &nnz[mbn_xy][8] ) | M16( &nnz[mbn_xy][12] );
+ int nnz_top1 = M16( &nnz[mbn_xy][10] ) | M16( &nnz[mbn_xy][14] );
+ nnz_top[0] = nnz_top[1] = nnz_top0 ? 0x0101 : 0;
+ nnz_top[2] = nnz_top[3] = nnz_top1 ? 0x0101 : 0;
+ }
+
+ uint8_t bS[4];
+ if( intra_cur || mbn_intra )
+ M32( bS ) = 0x03030303;
+ else
+ {
+ for( int i = 0; i < 4; i++ )
+ {
+ if( nnz_cur[i] || nnz_top[i] )
+ bS[i] = 2;
+ else
+ bS[i] = 1;
+ }
+ }
+ for( int i = 0; i < 4; i++ )
+ bs[1][4*j][i] = bS[i];
+ }
+ }
+ }
+}
+
static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
@@ -304,12 +486,10 @@
void x264_frame_deblock_row( x264_t *h, int mb_y )
{
- int b_interlaced = h->sh.b_mbaff;
+ int b_interlaced = SLICE_MBAFF;
int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
int stridey = h->fdec->i_stride[0];
- int stride2y = stridey << b_interlaced;
int strideuv = h->fdec->i_stride[1];
- int stride2uv = strideuv << b_interlaced;
for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
@@ -319,16 +499,18 @@
int mb_xy = h->mb.i_mb_xy;
int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
- uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&1][mb_x];
+ uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
- if( mb_y & b_interlaced )
+ if( mb_y & MB_INTERLACED )
{
pixy -= 15*stridey;
pixuv -= 7*strideuv;
}
+ int stride2y = stridey << MB_INTERLACED;
+ int stride2uv = strideuv << MB_INTERLACED;
int qp = h->mb.qp[mb_xy];
int qpc = h->chroma_qp_table[qp];
int first_edge_only = h->mb.type[mb_xy] == P_SKIP || qp <= qp_thresh;
@@ -347,16 +529,59 @@
if( h->mb.i_neighbour & MB_LEFT )
{
- int qpl = h->mb.qp[h->mb.i_mb_left_xy];
- int qp_left = (qp + qpl + 1) >> 1;
- int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
- int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_left_xy] );
- if( intra_cur || intra_left )
- FILTER( _intra, 0, 0, qp_left, qpc_left );
+ if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
+ {
+ int luma_qp[2];
+ int chroma_qp[2];
+ int left_qp[2];
+ int current_qp = h->mb.qp[mb_xy];
+ left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
+ luma_qp[0] = (current_qp + left_qp[0] + 1) >> 1;
+ chroma_qp[0] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
+ if( bs[0][0][0] == 4)
+ {
+ deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, deblock_v_luma_intra_mbaff_c );
+ deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
+ deblock_edge_intra( h, pixuv + 1, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
+ }
+ else
+ {
+ deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, deblock_v_luma_mbaff_c );
+ deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
+ deblock_edge( h, pixuv + 1, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
+ }
+
+ int offy = MB_INTERLACED ? 4 : 0;
+ int offuv = MB_INTERLACED ? 3 : 0;
+ left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
+ luma_qp[1] = (current_qp + left_qp[1] + 1) >> 1;
+ chroma_qp[1] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
+ if( bs[0][4][0] == 4)
+ {
+ deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], 0, deblock_v_luma_intra_mbaff_c );
+ deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
+ deblock_edge_intra( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
+ }
+ else
+ {
+ deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], 0, deblock_v_luma_mbaff_c );
+ deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
+ deblock_edge( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
+ }
+ }
else
- FILTER( , 0, 0, qp_left, qpc_left );
- }
+ {
+ int qpl = h->mb.qp[h->mb.i_mb_xy-1];
+ int qp_left = (qp + qpl + 1) >> 1;
+ int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
+ int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );
+ if( intra_cur || intra_left )
+ FILTER( _intra, 0, 0, qp_left, qpc_left );
+ else
+ FILTER( , 0, 0, qp_left, qpc_left );
+ }
+ }
if( !first_edge_only )
{
if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc );
@@ -366,17 +591,39 @@
if( h->mb.i_neighbour & MB_TOP )
{
- int qpt = h->mb.qp[h->mb.i_mb_top_xy];
- int qp_top = (qp + qpt + 1) >> 1;
- int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
- int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
- if( ~b_interlaced & (intra_cur | intra_top) )
- FILTER( _intra, 1, 0, qp_top, qpc_top );
+ if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
+ {
+ int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
+
+ for(int j=0; j<2; j++, mbn_xy += h->mb.i_mb_stride)
+ {
+ int qpt = h->mb.qp[mbn_xy];
+ int qp_top = (qp + qpt + 1) >> 1;
+ int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
+
+ // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
+ deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, 0, deblock_v_luma_c );
+ deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 1, deblock_v_chroma_c );
+ }
+ }
else
{
- if( intra_top )
- M32( bs[1][0] ) = 0x03030303;
- FILTER( , 1, 0, qp_top, qpc_top );
+ int qpt = h->mb.qp[h->mb.i_mb_top_xy];
+ int qp_top = (qp + qpt + 1) >> 1;
+ int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
+ int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
+
+ if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy]))
+ && (intra_cur || intra_top) )
+ {
+ FILTER( _intra, 1, 0, qp_top, qpc_top );
+ }
+ else
+ {
+ if( intra_top )
+ M32( bs[1][0] ) = 0x03030303;
+ FILTER( , 1, 0, qp_top, qpc_top );
+ }
}
}
@@ -401,17 +648,17 @@
*/
void x264_macroblock_deblock( x264_t *h )
{
- int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
+ int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
int qp = h->mb.i_qp;
if( qp <= qp_thresh || h->mb.i_type == P_SKIP )
return;
- uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
+ uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
if( IS_INTRA( h->mb.i_type ) )
- memset( bs, 3, 2*4*4*sizeof(uint8_t) );
+ memset( bs, 3, 2*8*4*sizeof(uint8_t) );
else
h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
- bs, 4 >> h->sh.b_mbaff, h->sh.i_type == SLICE_TYPE_B );
+ bs, 4 >> SLICE_MBAFF, h->sh.i_type == SLICE_TYPE_B, h );
int transform_8x8 = h->mb.b_transform_8x8;
pixel *fdec = h->mb.pic.p_fdec[0];
@@ -453,17 +700,17 @@
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- int mvy_limit, int bframe );
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe, x264_t *h );
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- int mvy_limit, int bframe );
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe, x264_t *h );
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- int mvy_limit, int bframe );
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe, x264_t *h );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- int mvy_limit, int bframe );
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe, x264_t *h );
#if ARCH_X86
void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
@@ -505,7 +752,7 @@
void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
#endif
-void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_luma[1] = deblock_v_luma_c;
pf->deblock_luma[0] = deblock_h_luma_c;
@@ -585,4 +832,6 @@
}
#endif
#endif // !HIGH_BIT_DEPTH
+
+ if( b_mbaff ) pf->deblock_strength = deblock_strength_mbaff_c;
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/frame.c
^
|
@@ -48,7 +48,7 @@
int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines;
- int i_padv = PADV << h->param.b_interlaced;
+ int i_padv = PADV << PARAM_INTERLACED;
int luma_plane_size, chroma_plane_size;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
@@ -100,20 +100,35 @@
CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+ if( PARAM_INTERLACED )
+ {
+ CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
+ frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;
+ }
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine && b_fdec )
{
+ /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size * sizeof(pixel) );
+ if( PARAM_INTERLACED )
+ CHECKED_MALLOC( frame->buffer_fld[0], 4*luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
+ {
frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ frame->filtered_fld[i] = frame->buffer_fld[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ }
frame->plane[0] = frame->filtered[0];
+ frame->plane_fld[0] = frame->filtered_fld[0];
}
else
{
CHECKED_MALLOC( frame->buffer[0], luma_plane_size * sizeof(pixel) );
+ if( PARAM_INTERLACED )
+ CHECKED_MALLOC( frame->buffer_fld[0], luma_plane_size * sizeof(pixel) );
frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
+ frame->filtered_fld[0] = frame->plane_fld[0] = frame->buffer_fld[0] + frame->i_stride[0] * i_padv + PADH;
}
frame->b_duplicate = 0;
@@ -139,12 +154,15 @@
}
CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
+ CHECKED_MALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[3],
frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
+ if( PARAM_INTERLACED )
+ CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
}
else /* fenc frame */
{
@@ -162,7 +180,7 @@
CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
- CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+ CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
@@ -198,7 +216,10 @@
if( !frame->b_duplicate )
{
for( int i = 0; i < 4; i++ )
+ {
x264_free( frame->buffer[i] );
+ x264_free( frame->buffer_fld[i] );
+ }
for( int i = 0; i < 4; i++ )
x264_free( frame->buffer_lowres[i] );
for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
@@ -219,6 +240,8 @@
x264_free( frame->i_inv_qscale_factor );
x264_free( frame->i_row_bits );
x264_free( frame->f_row_qp );
+ x264_free( frame->f_row_qscale );
+ x264_free( frame->field );
x264_free( frame->mb_type );
x264_free( frame->mb_partition );
x264_free( frame->mv[0] );
@@ -310,23 +333,56 @@
static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
{
uint8_t *dstp = (uint8_t*)dst;
- if( size == 1 )
- memset(dst, *src, len);
- else if( size == 2 )
+ uint8_t v1 = *src;
+ uint16_t v2 = size == 1 ? v1 + (v1 << 8) : M16( src );
+ uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
+ int i = 0;
+ len *= size;
+
+ /* Align the input pointer if it isn't already */
+ if( (intptr_t)dstp & (WORD_SIZE - 1) )
+ {
+ if( size <= 2 && ((intptr_t)dstp & 3) )
+ {
+ if( size == 1 && ((intptr_t)dstp & 1) )
+ dstp[i++] = v1;
+ if( (intptr_t)dstp & 2 )
+ {
+ M16( dstp+i ) = v2;
+ i += 2;
+ }
+ }
+ if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
+ {
+ M32( dstp+i ) = v4;
+ i += 4;
+ }
+ }
+
+ /* Main copy loop */
+ if( WORD_SIZE == 8 )
{
- int v = M16( src );
- for( int i = 0; i < len; i++ )
- M16( dstp+i*2 ) = v;
+ uint64_t v8 = v4 + ((uint64_t)v4<<32);
+ for( ; i < len - 7; i+=8 )
+ M64( dstp+i ) = v8;
}
- else if( size == 4 )
+ for( ; i < len - 3; i+=4 )
+ M32( dstp+i ) = v4;
+
+ /* Finish up the last few bytes */
+ if( size <= 2 )
{
- int v = M32( src );
- for( int i = 0; i < len; i++ )
- M32( dstp+i*4 ) = v;
+ if( i < len - 1 )
+ {
+ M16( dstp+i ) = v2;
+ i += 2;
+ }
+ if( size == 1 && i != len )
+ dstp[i] = v1;
}
}
-static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
+static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
for( int y = 0; y < i_height; y++ )
@@ -350,26 +406,35 @@
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
int b_start = !mb_y;
- if( mb_y & h->sh.b_mbaff )
+ if( mb_y & SLICE_MBAFF )
return;
for( int i = 0; i < frame->i_plane; i++ )
{
int stride = frame->i_stride[i];
int width = 16*h->sps->i_mb_width;
- int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
+ int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> !!i;
int padh = PADH;
int padv = PADV >> !!i;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
- pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
if( b_end && !b_start )
- height += 4 >> (!!i + h->sh.b_mbaff);
- if( h->sh.b_mbaff )
+ height += 4 >> (!!i + SLICE_MBAFF);
+ pixel *pix;
+ if( SLICE_MBAFF )
{
+ // border samples for each field are extended separately
+ pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i );
+
+ height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> !!i;
+ if( b_end && !b_start )
+ height += 4 >> (!!i);
+ pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
}
else
{
+ pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
}
}
@@ -383,20 +448,22 @@
int b_start = !mb_y;
int stride = frame->i_stride[0];
int width = 16*h->mb.i_mb_width + 8;
- int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
+ int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
int padh = PADH - 4;
int padv = PADV - 8;
for( int i = 1; i < 4; i++ )
{
// buffer: 8 luma, to match the hpel filter
- pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
- if( h->sh.b_mbaff )
+ pixel *pix;
+ if( SLICE_MBAFF )
{
+ pix = frame->filtered_fld[i] + (16*mb_y - 16) * stride - 4;
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
}
- else
- plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 );
+
+ pix = frame->filtered[i] + (16*mb_y - 8) * stride - 4;
+ plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
}
}
@@ -426,12 +493,28 @@
{
for( int y = i_height; y < i_height + i_pady; y++ )
memcpy( &frame->plane[i][y*frame->i_stride[i]],
- &frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
+ &frame->plane[i][(i_height-(~y&PARAM_INTERLACED)-1)*frame->i_stride[i]],
(i_width + i_padx) * sizeof(pixel) );
}
}
}
+void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
+{
+ for( int i = 0; i < h->fenc->i_plane; i++ )
+ {
+ int stride = h->fenc->i_stride[i];
+ int height = h->param.i_height >> !!i;
+ int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i;
+ int mbsize = (16>>!!i);
+ pixel *fenc = h->fenc->plane[i] + mbsize * mb_x;
+ for( int y = height; y < height + pady; y++ )
+ memcpy( fenc + y*stride,
+ fenc + (height-1)*stride,
+ mbsize * sizeof(pixel) );
+ }
+}
+
/* threading */
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/frame.h
^
|
@@ -72,13 +72,16 @@
int i_width_lowres;
int i_lines_lowres;
pixel *plane[2];
+ pixel *plane_fld[2];
pixel *filtered[4]; /* plane[0], H, V, HV */
+ pixel *filtered_fld[4];
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
pixel *buffer[4];
+ pixel *buffer_fld[4];
pixel *buffer_lowres[4];
x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
@@ -92,6 +95,7 @@
int16_t (*mv[2])[2];
int16_t (*mv16x16)[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+ uint8_t *field;
/* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
* Doesn't need special addressing for intra cost because
@@ -117,6 +121,7 @@
int *i_row_satd;
int *i_row_bits;
float *f_row_qp;
+ float *f_row_qscale;
float *f_qp_offset;
float *f_qp_offset_aq;
int b_intra_calculated;
@@ -178,8 +183,8 @@
x264_deblock_intra_t deblock_luma_intra[2];
x264_deblock_intra_t deblock_chroma_intra[2];
void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
- int bframe );
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
+ int bframe, x264_t *h );
} x264_deblock_function_t;
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
@@ -191,6 +196,7 @@
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border_lowres( x264_frame_t *frame );
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
+void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );
void x264_frame_deblock_row( x264_t *h, int mb_y );
void x264_macroblock_deblock( x264_t *h );
@@ -198,7 +204,7 @@
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );
-void x264_deblock_init( int cpu, x264_deblock_function_t *pf );
+void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff );
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/macroblock.c
^
|
@@ -40,7 +40,7 @@
mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
// chroma is offset if MCing from a field of opposite parity
- if( h->mb.b_interlaced & i_ref )
+ if( MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
@@ -69,7 +69,7 @@
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height, weight_none );
- if( h->mb.b_interlaced & i_ref )
+ if( MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
@@ -101,9 +101,9 @@
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
- if( h->mb.b_interlaced & i_ref0 )
+ if( MB_INTERLACED & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
- if( h->mb.b_interlaced & i_ref1 )
+ if( MB_INTERLACED & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
@@ -212,7 +212,7 @@
h->mb.i_b8_stride = h->mb.i_mb_width * 2;
h->mb.i_b4_stride = h->mb.i_mb_width * 4;
- h->mb.b_interlaced = h->param.b_interlaced;
+ h->mb.b_interlaced = PARAM_INTERLACED;
CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
@@ -236,7 +236,7 @@
for( int i = 0; i < 2; i++ )
{
- int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
+ int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
@@ -250,7 +250,7 @@
if( h->param.analyse.i_weighted_pred )
{
- int i_padv = PADV << h->param.b_interlaced;
+ int i_padv = PADV << PARAM_INTERLACED;
int luma_plane_size = 0;
int numweightbuf;
@@ -314,18 +314,22 @@
int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
{
if( !b_lookahead )
- for( int i = 0; i <= h->param.b_interlaced; i++ )
- {
+ {
+ for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
for( int j = 0; j < 2; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
h->intra_border_backup[i][j] += 16;
- h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
+ if( !PARAM_INTERLACED )
+ h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
}
+ for( int i = 0; i <= PARAM_INTERLACED; i++ )
+ {
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
h->deblock_strength[1] = h->deblock_strength[i];
}
+ }
/* Allocate scratch buffer */
int scratch_size = 0;
@@ -338,7 +342,7 @@
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int);
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@@ -353,12 +357,13 @@
void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
{
if( !b_lookahead )
- for( int i = 0; i <= h->param.b_interlaced; i++ )
- {
+ {
+ for( int i = 0; i <= PARAM_INTERLACED; i++ )
x264_free( h->deblock_strength[i] );
+ for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
for( int j = 0; j < 2; j++ )
x264_free( h->intra_border_backup[i][j] - 16 );
- }
+ }
x264_free( h->scratch_buffer );
}
@@ -371,6 +376,7 @@
h->mb.ref[1] = h->fdec->ref[1];
h->mb.type = h->fdec->mb_type;
h->mb.partition = h->fdec->mb_partition;
+ h->mb.field = h->fdec->field;
h->fdec->i_ref[0] = h->i_ref[0];
h->fdec->i_ref[1] = h->i_ref[1];
@@ -403,12 +409,12 @@
{
deblock_ref_table(-2) = -2;
deblock_ref_table(-1) = -1;
- for( int i = 0; i < h->i_ref[0] << h->sh.b_mbaff; i++ )
+ for( int i = 0; i < h->i_ref[0] << SLICE_MBAFF; i++ )
{
/* Mask off high bits to avoid frame num collisions with -1/-2.
* In current x264 frame num values don't cover a range of more
* than 32, so 6 bits is enough for uniqueness. */
- if( !h->mb.b_interlaced )
+ if( !MB_INTERLACED )
deblock_ref_table(i) = h->fref[0][i]->i_frame_num&63;
else
deblock_ref_table(i) = ((h->fref[0][i>>1]->i_frame_num&63)<<1) + (i&1);
@@ -420,7 +426,7 @@
memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
if( h->i_ref[0] > 0 )
- for( int field = 0; field <= h->sh.b_mbaff; field++ )
+ for( int field = 0; field <= SLICE_MBAFF; field++ )
{
int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
int refpoc = h->fref[0][0]->i_poc + h->fref[0][0]->i_delta_poc[field];
@@ -452,7 +458,7 @@
(h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9));
h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
(h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
-
+ h->mb.i_mb_prev_xy = -1;
/* fdec: fenc:
* yyyyyyy
@@ -489,18 +495,20 @@
dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
}
-static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced )
+static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_mbaff )
{
int w = (i ? 8 : 16);
int i_stride = h->fdec->i_stride[i];
- int i_stride2 = i_stride << b_interlaced;
- int i_pix_offset = b_interlaced
+ int i_stride2 = i_stride << MB_INTERLACED;
+ int i_pix_offset = MB_INTERLACED
? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
: 16 * mb_x + w * mb_y * i_stride;
pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
- pixel *intra_fdec = &h->intra_border_backup[mb_y&1][i][mb_x*16];
+ int fdec_idx = b_mbaff ? (MB_INTERLACED ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
+ pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
- if( b_interlaced )
+ /* ref_pix_offset[0] references the current field and [1] the opposite field. */
+ if( MB_INTERLACED )
ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
@@ -509,13 +517,20 @@
h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
+ if( b_mbaff )
+ {
+ h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
+ h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
+ }
}
else
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 );
memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
+ if( b_mbaff )
+ h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = intra_fdec[-1];
}
- if( b_interlaced )
+ if( b_mbaff )
{
for( int j = 0; j < w; j++ )
if( i )
@@ -526,15 +541,28 @@
else
h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
+ pixel *plane_src, **filtered_src;
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
- h->mb.pic.p_fref[0][j][i?4:0] = &h->fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+ // Interpolate between pixels in same field.
+ if( MB_INTERLACED )
+ {
+ plane_src = h->fref[0][j>>1]->plane_fld[i];
+ filtered_src = h->fref[0][j>>1]->filtered_fld;
+ }
+ else
+ {
+ plane_src = h->fref[0][j]->plane[i];
+ filtered_src = h->fref[0][j]->filtered;
+ }
+ h->mb.pic.p_fref[0][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+
if( !i )
{
for( int k = 1; k < 4; k++ )
- h->mb.pic.p_fref[0][j][k] = &h->fref[0][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+ h->mb.pic.p_fref[0][j][k] = filtered_src[k] + ref_pix_offset[j&1];
if( h->sh.weight[j][0].weightfn )
- h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> b_interlaced][ref_pix_offset[j&1]];
+ h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> MB_INTERLACED][ref_pix_offset[j&1]];
else
h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
}
@@ -542,57 +570,160 @@
if( h->sh.i_type == SLICE_TYPE_B )
for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
{
- h->mb.pic.p_fref[1][j][i?4:0] = &h->fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+ if( MB_INTERLACED )
+ {
+ plane_src = h->fref[1][j>>1]->plane_fld[i];
+ filtered_src = h->fref[1][j>>1]->filtered_fld;
+ }
+ else
+ {
+ plane_src = h->fref[1][j]->plane[i];
+ filtered_src = h->fref[1][j]->filtered;
+ }
+ h->mb.pic.p_fref[1][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+
if( !i )
for( int k = 1; k < 4; k++ )
- h->mb.pic.p_fref[1][j][k] = &h->fref[1][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+ h->mb.pic.p_fref[1][j][k] = filtered_src[k] + ref_pix_offset[j&1];
}
}
-static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
+static const x264_left_table_t left_indices[4] =
{
- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+ /* Current is progressive */
+ {{ 4, 4, 5, 5}, { 3, 3, 7, 7}, {16+1, 16+1, 16+4+1, 16+4+1}, {0, 0, 1, 1}, {0, 0, 0, 0}},
+ {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+3, 16+3, 16+4+3, 16+4+3}, {2, 2, 3, 3}, {1, 1, 1, 1}},
+ /* Current is interlaced */
+ {{ 4, 6, 4, 6}, { 3, 11, 3, 11}, {16+1, 16+1, 16+4+1, 16+4+1}, {0, 2, 0, 2}, {0, 1, 0, 1}},
+ /* Both same */
+ {{ 4, 5, 6, 3}, { 3, 7, 11, 15}, {16+1, 16+3, 16+4+1, 16+4+3}, {0, 1, 2, 3}, {0, 0, 1, 1}}
+};
+
+static void ALWAYS_INLINE x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced )
+{
+ const int mb_interlaced = b_interlaced && MB_INTERLACED;
+ int top_y = mb_y - (1 << mb_interlaced);
+ int top = top_y * h->mb.i_mb_stride + mb_x;
h->mb.i_mb_x = mb_x;
h->mb.i_mb_y = mb_y;
h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
+ h->mb.left_b8[0] =
+ h->mb.left_b8[1] = -1;
+ h->mb.left_b4[0] =
+ h->mb.left_b4[1] = -1;
h->mb.i_neighbour = 0;
h->mb.i_neighbour_intra = 0;
h->mb.i_neighbour_frame = 0;
h->mb.i_mb_top_xy = -1;
- h->mb.i_mb_left_xy = -1;
+ h->mb.i_mb_top_y = -1;
+ h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1;
h->mb.i_mb_topleft_xy = -1;
h->mb.i_mb_topright_xy = -1;
h->mb.i_mb_type_top = -1;
- h->mb.i_mb_type_left = -1;
+ h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1;
h->mb.i_mb_type_topleft = -1;
h->mb.i_mb_type_topright = -1;
+ h->mb.left_index_table = &left_indices[3];
+ h->mb.topleft_partition = 0;
+
+ int topleft_y = top_y;
+ int topright_y = top_y;
+ int left[2];
+
+ left[0] = left[1] = h->mb.i_mb_xy - 1;
+ h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2;
+ h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4;
+
+ if( b_interlaced )
+ {
+ h->mb.i_mb_top_mbpair_xy = h->mb.i_mb_xy - 2*h->mb.i_mb_stride;
+ h->mb.i_mb_topleft_y = -1;
+ h->mb.i_mb_topright_y = -1;
+
+ if( mb_y&1 )
+ {
+ if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
+ {
+ left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride;
+ h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride;
+ h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride;
+
+ if( mb_interlaced )
+ {
+ h->mb.left_index_table = &left_indices[2];
+ left[1] += h->mb.i_mb_stride;
+ h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
+ h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
+ }
+ else
+ {
+ h->mb.left_index_table = &left_indices[1];
+ topleft_y++;
+ h->mb.topleft_partition = 1;
+ }
+ }
+ if( !mb_interlaced )
+ topright_y = -1;
+ }
+ else
+ {
+ if( mb_interlaced && top >= 0 )
+ {
+ if( !h->mb.field[top] )
+ {
+ top += h->mb.i_mb_stride;
+ top_y++;
+ }
+ if( mb_x )
+ topleft_y += !h->mb.field[h->mb.i_mb_stride*topleft_y + mb_x - 1];
+ if( mb_x < h->mb.i_mb_width-1 )
+ topright_y += !h->mb.field[h->mb.i_mb_stride*topright_y + mb_x + 1];
+ }
+ if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
+ {
+ if( mb_interlaced )
+ {
+ h->mb.left_index_table = &left_indices[2];
+ left[1] += h->mb.i_mb_stride;
+ h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
+ h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
+ }
+ else
+ h->mb.left_index_table = &left_indices[0];
+ }
+ }
+ }
if( mb_x > 0 )
{
h->mb.i_neighbour_frame |= MB_LEFT;
- h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1;
- h->mb.i_mb_type_left = h->mb.type[h->mb.i_mb_left_xy];
- if( h->mb.i_mb_xy > h->sh.i_first_mb )
+ h->mb.i_mb_left_xy[0] = left[0];
+ h->mb.i_mb_left_xy[1] = left[1];
+ h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]];
+ h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]];
+ if( h->mb.slice_table[left[0]] == h->sh.i_first_mb )
{
h->mb.i_neighbour |= MB_LEFT;
- if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left ) )
+ // FIXME: We don't currently support constrained intra + mbaff.
+ if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) )
h->mb.i_neighbour_intra |= MB_LEFT;
}
}
/* We can't predict from the previous threadslice since it hasn't been encoded yet. */
- if( (h->i_threadslice_start >> h->mb.b_interlaced) != (mb_y >> h->mb.b_interlaced) )
+ if( (h->i_threadslice_start >> mb_interlaced) != (mb_y >> mb_interlaced) )
{
if( top >= 0 )
{
h->mb.i_neighbour_frame |= MB_TOP;
h->mb.i_mb_top_xy = top;
+ h->mb.i_mb_top_y = top_y;
h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy];
- if( top >= h->sh.i_first_mb )
+ if( h->mb.slice_table[top] == h->sh.i_first_mb )
{
h->mb.i_neighbour |= MB_TOP;
@@ -611,12 +742,13 @@
}
}
- if( mb_x > 0 && top - 1 >= 0 )
+ if( mb_x > 0 && topleft_y >= 0 )
{
h->mb.i_neighbour_frame |= MB_TOPLEFT;
- h->mb.i_mb_topleft_xy = top - 1;
+ h->mb.i_mb_topleft_xy = h->mb.i_mb_stride*topleft_y + mb_x - 1;
+ h->mb.i_mb_topleft_y = topleft_y;
h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy];
- if( top - 1 >= h->sh.i_first_mb )
+ if( h->mb.slice_table[h->mb.i_mb_topleft_xy] == h->sh.i_first_mb )
{
h->mb.i_neighbour |= MB_TOPLEFT;
@@ -625,12 +757,13 @@
}
}
- if( mb_x < h->mb.i_mb_width - 1 && top + 1 >= 0 )
+ if( mb_x < h->mb.i_mb_width - 1 && topright_y >= 0 )
{
h->mb.i_neighbour_frame |= MB_TOPRIGHT;
- h->mb.i_mb_topright_xy = top + 1;
+ h->mb.i_mb_topright_xy = h->mb.i_mb_stride*topright_y + mb_x + 1;
+ h->mb.i_mb_topright_y = topright_y;
h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy];
- if( top + 1 >= h->sh.i_first_mb )
+ if( h->mb.slice_table[h->mb.i_mb_topright_xy] == h->sh.i_first_mb )
{
h->mb.i_neighbour |= MB_TOPRIGHT;
@@ -641,13 +774,20 @@
}
}
-void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
+#define LTOP 0
+#if HAVE_INTERLACED
+# define LBOT 1
+#else
+# define LBOT 0
+#endif
+
+void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_mbaff )
{
- x264_macroblock_cache_load_neighbours( h, mb_x, mb_y );
+ x264_macroblock_cache_load_neighbours( h, mb_x, mb_y, b_mbaff );
- int left = h->mb.i_mb_left_xy;
+ int *left = h->mb.i_mb_left_xy;
int top = h->mb.i_mb_top_xy;
- int top_y = mb_y - (1 << h->mb.b_interlaced);
+ int top_y = h->mb.i_mb_top_y;
int s8x8 = h->mb.i_b8_stride;
int s4x4 = h->mb.i_b4_stride;
int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x;
@@ -660,6 +800,8 @@
uint8_t (*nnz)[24] = h->mb.non_zero_count;
int16_t *cbp = h->mb.cbp;
+ const x264_left_table_t *left_index_table = h->mb.left_index_table;
+
/* load cache */
if( h->mb.i_neighbour & MB_TOP )
{
@@ -699,25 +841,53 @@
if( h->mb.i_neighbour & MB_LEFT )
{
- h->mb.cache.i_cbp_left = cbp[left];
+ if( b_mbaff )
+ {
+ const int16_t top_luma = (cbp[left[LTOP]] >> (left_index_table->mv[0]&(~1))) & 2;
+ const int16_t bot_luma = (cbp[left[LBOT]] >> (left_index_table->mv[2]&(~1))) & 2;
+ h->mb.cache.i_cbp_left = (cbp[left[LTOP]] & 0xfff0) | (bot_luma<<2) | top_luma;
+ }
+ else
+ h->mb.cache.i_cbp_left = cbp[left[0]];
+ if( b_mbaff )
+ {
+ /* load intra4x4 */
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left[LTOP]][left_index_table->intra[0]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left[LTOP]][left_index_table->intra[1]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left[LBOT]][left_index_table->intra[2]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left[LBOT]][left_index_table->intra[3]];
+
+ /* load non_zero_count */
+ h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[LTOP]][left_index_table->nnz[0]];
+ h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[LTOP]][left_index_table->nnz[1]];
+ h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[LBOT]][left_index_table->nnz[2]];
+ h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[LBOT]][left_index_table->nnz[3]];
- /* load intra4x4 */
- h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
- h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
- h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
- h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
+ h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left[LTOP]][left_index_table->nnz_chroma[0]];
+ h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left[LBOT]][left_index_table->nnz_chroma[1]];
- /* load non_zero_count */
- h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+ h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left[LTOP]][left_index_table->nnz_chroma[2]];
+ h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left[LBOT]][left_index_table->nnz_chroma[3]];
+ }
+ else
+ {
+ int l = left[0];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[l][left_index_table->intra[0]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[l][left_index_table->intra[1]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[l][left_index_table->intra[2]];
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[l][left_index_table->intra[3]];
+
+ h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[l][left_index_table->nnz[0]];
+ h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[l][left_index_table->nnz[1]];
+ h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[l][left_index_table->nnz[2]];
+ h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[l][left_index_table->nnz[3]];
- h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
- h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
+ h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[l][left_index_table->nnz_chroma[0]];
+ h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[l][left_index_table->nnz_chroma[1]];
- h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
- h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+ h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[l][left_index_table->nnz_chroma[2]];
+ h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[l][left_index_table->nnz_chroma[3]];
+ }
}
else
{
@@ -742,20 +912,17 @@
if( h->pps->b_transform_8x8_mode )
{
h->mb.cache.i_neighbour_transform_size =
- ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
+ ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] )
+ ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] );
}
- if( h->sh.b_mbaff )
+ if( b_mbaff )
{
- h->mb.pic.i_fref[0] = h->i_ref[0] << h->mb.b_interlaced;
- h->mb.pic.i_fref[1] = h->i_ref[1] << h->mb.b_interlaced;
- h->mb.cache.i_neighbour_interlaced =
- !!(h->mb.i_neighbour & MB_LEFT)
- + !!(h->mb.i_neighbour & MB_TOP);
+ h->mb.pic.i_fref[0] = h->i_ref[0] << MB_INTERLACED;
+ h->mb.pic.i_fref[1] = h->i_ref[1] << MB_INTERLACED;
}
- if( !h->mb.b_interlaced )
+ if( !b_mbaff )
{
x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
@@ -789,8 +956,17 @@
int i8 = x264_scan8[0] - 1 - 1*8;
if( h->mb.i_neighbour & MB_TOPLEFT )
{
- h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
- CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
+ int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topleft_y + mb_x-1)+1+s8x8 : top_8x8 - 1;
+ int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topleft_y + mb_x-1)+3+3*s4x4 : top_4x4 - 1;
+ if( b_mbaff && h->mb.topleft_partition )
+ {
+ /* Take motion vector from the middle of macroblock instead of
+ * the bottom right as usual. */
+ iv -= 2*s4x4;
+ ir -= s8x8;
+ }
+ h->mb.cache.ref[l][i8] = ref[ir];
+ CP32( h->mb.cache.mv[l][i8], mv[iv] );
}
else
{
@@ -816,8 +992,10 @@
i8 = x264_scan8[0] + 4 - 1*8;
if( h->mb.i_neighbour & MB_TOPRIGHT )
{
- h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
- CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
+ int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topright_y + (mb_x+1))+s8x8 : top_8x8 + 2;
+ int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topright_y + (mb_x+1))+3*s4x4 : top_4x4 + 4;
+ h->mb.cache.ref[l][i8] = ref[ir];
+ CP32( h->mb.cache.mv[l][i8], mv[iv] );
}
else
h->mb.cache.ref[l][i8] = -2;
@@ -825,17 +1003,32 @@
i8 = x264_scan8[0] - 1;
if( h->mb.i_neighbour & MB_LEFT )
{
- const int ir = h->mb.i_b8_xy - 1;
- const int iv = h->mb.i_b4_xy - 1;
- h->mb.cache.ref[l][i8+0*8] =
- h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
- h->mb.cache.ref[l][i8+2*8] =
- h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
-
- CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
- CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
- CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
- CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+ if( b_mbaff )
+ {
+ h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[0]];
+ h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[1]];
+ h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[2]];
+ h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[3]];
+
+ CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[0]] );
+ CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[1]] );
+ CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[2]] );
+ CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[3]] );
+ }
+ else
+ {
+ const int ir = h->mb.i_b8_xy - 1;
+ const int iv = h->mb.i_b4_xy - 1;
+ h->mb.cache.ref[l][i8+0*8] =
+ h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
+ h->mb.cache.ref[l][i8+2*8] =
+ h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
+
+ CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+ }
}
else
{
@@ -846,6 +1039,39 @@
}
}
+ /* Extra logic for top right mv in mbaff.
+ * . . . d . . a .
+ * . . . e . . . .
+ * . . . f b . c .
+ * . . . . . . . .
+ *
+ * If the top right of the 4x4 partitions labeled a, b and c in the
+ * above diagram do not exist, but the entries d, e and f exist (in
+ * the macroblock to the left) then use those instead.
+ */
+ if( b_mbaff && (h->mb.i_neighbour & MB_LEFT) )
+ {
+ if( MB_INTERLACED && !h->mb.field[h->mb.i_mb_xy-1] )
+ {
+ h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0];
+ h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1];
+ h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0];
+ CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[0]+1)] );
+ CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[1]+1)] );
+ CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table->mv[2]+1)] );
+ }
+ else if( !MB_INTERLACED && h->mb.field[h->mb.i_mb_xy-1] )
+ {
+ // Looking at the bottom field so always take the bottom macroblock of the pair.
+ h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
+ h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
+ h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]];
+ CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] );
+ CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] );
+ CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[2]] );
+ }
+ }
+
if( h->param.b_cabac )
{
uint8_t (*mvd)[8][2] = h->mb.mvd[l];
@@ -854,31 +1080,169 @@
else
M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
+ if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1] >= 0) )
+ {
+ CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[LTOP]][left_index_table->intra[0]] );
+ CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[LTOP]][left_index_table->intra[1]] );
+ }
+ else
+ {
+ M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0;
+ M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0;
+ }
+ if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >=0) )
+ {
+ CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[LBOT]][left_index_table->intra[2]] );
+ CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[LBOT]][left_index_table->intra[3]] );
+ }
+ else
+ {
+ M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0;
+ M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0;
+ }
+ }
+
+ /* If motion vectors are cached from frame macroblocks but this
+ * macroblock is a field macroblock then the motion vector must be
+ * halved. Similarly, motion vectors from field macroblocks are doubled. */
+ if( b_mbaff )
+ {
+#define MAP_MVS\
+ if( FIELD_DIFFERENT(h->mb.i_mb_topleft_xy) )\
+ MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8)\
+ if( FIELD_DIFFERENT(top) )\
+ {\
+ MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8)\
+ }\
+ if( FIELD_DIFFERENT(h->mb.i_mb_topright_xy) )\
+ MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8)\
+ if( FIELD_DIFFERENT(left[0]) )\
+ {\
+ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8)\
+ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8)\
+ MAP_F2F(topright_mv, topright_ref, 0)\
+ MAP_F2F(topright_mv, topright_ref, 1)\
+ MAP_F2F(topright_mv, topright_ref, 2)\
+ }
+
+ if( MB_INTERLACED )
+ {
+#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && !h->mb.field[macroblock])
+#define MAP_F2F(varmv, varref, index)\
+ if( h->mb.cache.varref[l][index] >= 0 )\
+ {\
+ h->mb.cache.varref[l][index] <<= 1;\
+ h->mb.cache.varmv[l][index][1] /= 2;\
+ h->mb.cache.mvd[l][index][1] >>= 1;\
+ }
+ MAP_MVS
+#undef MAP_F2F
+#undef FIELD_DIFFERENT
+ }
+ else
+ {
+#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && h->mb.field[macroblock])
+#define MAP_F2F(varmv, varref, index)\
+ if( h->mb.cache.varref[l][index] >= 0 )\
+ {\
+ h->mb.cache.varref[l][index] >>= 1;\
+ h->mb.cache.varmv[l][index][1] <<= 1;\
+ h->mb.cache.mvd[l][index][1] <<= 1;\
+ }
+ MAP_MVS
+#undef MAP_F2F
+#undef FIELD_DIFFERENT
+ }
+ }
+ }
+
+ if( b_mbaff && mb_x == 0 && !(mb_y&1) && mb_y > 0 )
+ h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_xy - h->mb.i_mb_stride];
+
+ /* Check whether skip here would cause decoder to predict interlace mode incorrectly.
+ * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */
+ h->mb.b_allow_skip = 1;
+ if( b_mbaff )
+ {
+ if( MB_INTERLACED != h->mb.field_decoding_flag &&
+ h->mb.i_mb_prev_xy >= 0 && IS_SKIP(h->mb.type[h->mb.i_mb_prev_xy]) )
+ h->mb.b_allow_skip = 0;
+ if( (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
+ {
if( h->mb.i_neighbour & MB_LEFT )
{
- CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
- CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
- CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
- CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
+ if( h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
+ h->mb.b_allow_skip = 0;
+ }
+ else if( h->mb.i_neighbour & MB_TOP )
+ {
+ if( h->mb.field[h->mb.i_mb_top_xy] != MB_INTERLACED )
+ h->mb.b_allow_skip = 0;
+ }
+ else // Frame mb pair is predicted
+ {
+ if( MB_INTERLACED )
+ h->mb.b_allow_skip = 0;
+ }
+ }
+ }
+
+ if( h->param.b_cabac )
+ {
+ if( b_mbaff )
+ {
+ int left_xy, top_xy;
+ /* Neighbours here are calculated based on field_decoding_flag */
+ int mb_xy = mb_x + (mb_y&~1)*h->mb.i_mb_stride;
+ left_xy = mb_xy - 1;
+ if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] )
+ left_xy += h->mb.i_mb_stride;
+ if( h->mb.field_decoding_flag )
+ {
+ top_xy = mb_xy - h->mb.i_mb_stride;
+ if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] )
+ top_xy -= h->mb.i_mb_stride;
}
else
- for( int i = 0; i < 4; i++ )
- M16( h->mb.cache.mvd[l][x264_scan8[0]-1+i*8] ) = 0;
+ top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride;
+
+ h->mb.cache.i_neighbour_skip = (mb_x > 0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] ))
+ + (top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[top_xy] ));
+ }
+ else
+ {
+ h->mb.cache.i_neighbour_skip = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] ))
+ + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top ));
}
}
/* load skip */
if( h->sh.i_type == SLICE_TYPE_B )
{
- h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(mb_y&1)];
- h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(mb_y&1)];
+ h->mb.bipred_weight = h->mb.bipred_weight_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
+ h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
if( h->param.b_cabac )
{
uint8_t skipbp;
x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
- skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left] : 0;
- h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
- h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
+ if( b_mbaff )
+ {
+ skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LTOP]] : 0;
+ h->mb.cache.skip[x264_scan8[0] - 1] = (skipbp >> (1+(left_index_table->mv[0]&~1))) & 1;
+ skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LBOT]] : 0;
+ h->mb.cache.skip[x264_scan8[8] - 1] = (skipbp >> (1+(left_index_table->mv[2]&~1))) & 1;
+ }
+ else
+ {
+ skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0;
+ h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+ h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
+ }
skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0;
h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
@@ -902,36 +1266,67 @@
| ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
}
+void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y )
+{
+ x264_macroblock_cache_load( h, mb_x, mb_y, 0 );
+}
+
+void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y )
+{
+ x264_macroblock_cache_load( h, mb_x, mb_y, 1 );
+}
+
void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
{
int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
h->mb.i_neighbour = 0;
h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
-
- if( mb_x > 0 )
+ h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
+ h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
+ h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
+ h->mb.i_mb_left_xy[1] =
+ h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+ if( SLICE_MBAFF )
{
- h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1;
- if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
- h->mb.i_neighbour |= MB_LEFT;
+ if( mb_y&1 )
+ {
+ if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
+ h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
+ }
+ else
+ {
+ if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
+ {
+ h->mb.i_mb_top_xy += h->mb.i_mb_stride;
+ h->mb.i_mb_top_y++;
+ }
+ if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
+ h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
+ }
}
- if( mb_y > h->mb.b_interlaced )
- {
- h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
- if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
- h->mb.i_neighbour |= MB_TOP;
- }
+ if( mb_x > 0 && (deblock_on_slice_edges ||
+ h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
+ h->mb.i_neighbour |= MB_LEFT;
+ if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
+ || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
+ h->mb.i_neighbour |= MB_TOP;
}
-void x264_macroblock_cache_load_deblock( x264_t *h )
+void x264_macroblock_deblock_strength( x264_t *h )
{
+ uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
- return;
+ {
+ memset( bs[0], 3, 4*4*sizeof(uint8_t) );
+ memset( bs[1], 3, 4*4*sizeof(uint8_t) );
+ if( !SLICE_MBAFF ) return;
+ }
/* If we have multiple slices and we're deblocking on slice edges, we
* have to reload neighbour data. */
- if( h->sh.i_first_mb && h->sh.i_disable_deblocking_filter_idc != 2 )
+ if( SLICE_MBAFF || (h->sh.i_first_mb && h->sh.i_disable_deblocking_filter_idc != 2) )
{
int old_neighbour = h->mb.i_neighbour;
int mb_x = h->mb.i_mb_x;
@@ -941,24 +1336,25 @@
h->mb.i_neighbour &= ~old_neighbour;
if( h->mb.i_neighbour )
{
- int top_y = mb_y - (1 << h->mb.b_interlaced);
+ int top_y = h->mb.i_mb_top_y;
int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
int s8x8 = h->mb.i_b8_stride;
int s4x4 = h->mb.i_b4_stride;
uint8_t (*nnz)[24] = h->mb.non_zero_count;
+ const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
if( h->mb.i_neighbour & MB_TOP )
CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
if( h->mb.i_neighbour & MB_LEFT )
{
- int left = h->mb.i_mb_left_xy;
- h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+ int *left = h->mb.i_mb_left_xy;
+ h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
+ h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
+ h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
+ h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
}
for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
@@ -979,17 +1375,15 @@
i8 = x264_scan8[0] - 1;
if( h->mb.i_neighbour & MB_LEFT )
{
- int ir = h->mb.i_b8_xy - 1;
- int iv = h->mb.i_b4_xy - 1;
h->mb.cache.ref[l][i8+0*8] =
- h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
+ h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
h->mb.cache.ref[l][i8+2*8] =
- h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
+ h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];
- CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
- CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
- CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
- CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
+ CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
+ CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
+ CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
}
}
}
@@ -1029,7 +1423,7 @@
{
uint8_t (*nnz)[24] = h->mb.non_zero_count;
int top = h->mb.i_mb_top_xy;
- int left = h->mb.i_mb_left_xy;
+ int *left = h->mb.i_mb_left_xy;
if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
{
@@ -1040,15 +1434,21 @@
M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
}
- if( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
+ if( h->mb.i_neighbour & MB_LEFT )
{
int i8 = x264_scan8[0] - 1;
- int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
- int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
- h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
- h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
- h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
- h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
+ if( h->mb.mb_transform_size[left[0]] )
+ {
+ int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
+ }
+ if( h->mb.mb_transform_size[left[1]] )
+ {
+ int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
+ }
}
if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
@@ -1066,43 +1466,55 @@
M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
}
}
-}
-static void ALWAYS_INLINE twiddle_topleft_pixel( pixel *dst, pixel *src, int b_interlaced )
-{
- // We update intra_border_backup in-place, so the topleft neighbor will no longer
- // exist there when load_pic_pointers wants it. Move it within p_fdec instead.
- if( b_interlaced )
- {
- dst[0] = dst[-1];
- dst[-1] = src[0];
- }
- else
- dst[0] = src[0];
+ int mvy_limit = 4 >> MB_INTERLACED;
+ h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
+ bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, h );
}
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced )
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_mbaff )
{
int w = i ? 8 : 16;
int i_stride = h->fdec->i_stride[i];
- int i_stride2 = i_stride << b_interlaced;
- int i_pix_offset = b_interlaced
+ int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
+ int i_pix_offset = (b_mbaff && MB_INTERLACED)
? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
: 16 * mb_x + w * mb_y * i_stride;
- pixel *intra_fdec = &h->intra_border_backup[mb_y&1][i][mb_x*16];
if( i )
- {
h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
- memcpy( intra_fdec, h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
- memcpy( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
- twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, b_interlaced );
- twiddle_topleft_pixel( h->mb.pic.p_fdec[2]-FDEC_STRIDE-1, h->mb.pic.p_fdec[2]-FDEC_STRIDE+7, b_interlaced );
+ else
+ h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+}
+
+static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff )
+{
+ /* In MBAFF we store the last two rows in intra_border_backup[0] and [1].
+ * For progressive mbs this is the bottom two rows, and for interlaced the
+ * bottom row of each field. We also store samples needed for the next
+ * mbpair in intra_border_backup[2]. */
+ int backup_dst = !b_mbaff ? 0 : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
+ memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
+ if( b_mbaff )
+ {
+ if( mb_y&1 )
+ {
+ int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
+ backup_dst = MB_INTERLACED ? 2 : 0;
+ memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*sizeof(pixel) );
+ backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
+ }
}
else
{
- h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
- memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
- twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, b_interlaced );
+ /* In progressive we update intra_border_backup in-place, so the topleft neighbor will
+ * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
+ h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
+ h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7];
+ h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7];
}
}
@@ -1120,13 +1532,15 @@
int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
- if( h->mb.b_interlaced )
+ if( SLICE_MBAFF )
{
+ x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 );
x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 1 );
x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1 );
}
else
{
+ x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 );
x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0 );
x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0 );
}
@@ -1285,42 +1699,43 @@
void x264_macroblock_bipred_init( x264_t *h )
{
- for( int field = 0; field <= h->sh.b_mbaff; field++ )
- for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<h->sh.b_mbaff); i_ref0++ )
- {
- x264_frame_t *l0 = h->fref[0][i_ref0>>h->sh.b_mbaff];
- int poc0 = l0->i_poc + l0->i_delta_poc[field^(i_ref0&1)];
- for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<h->sh.b_mbaff); i_ref1++ )
- {
- int dist_scale_factor;
- x264_frame_t *l1 = h->fref[1][i_ref1>>h->sh.b_mbaff];
- int poc1 = l1->i_poc + l1->i_delta_poc[field^(i_ref1&1)];
- int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
- int td = x264_clip3( poc1 - poc0, -128, 127 );
- if( td == 0 /* || pic0 is a long-term ref */ )
- dist_scale_factor = 256;
- else
- {
- int tb = x264_clip3( cur_poc - poc0, -128, 127 );
- int tx = (16384 + (abs(td) >> 1)) / td;
- dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
- }
-
- h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
-
- dist_scale_factor >>= 2;
- if( h->param.analyse.b_weighted_bipred
- && dist_scale_factor >= -64
- && dist_scale_factor <= 128 )
+ for( int mbfield = 0; mbfield <= SLICE_MBAFF; mbfield++ )
+ for( int field = 0; field <= SLICE_MBAFF; field++ )
+ for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ )
+ {
+ x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield];
+ int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)];
+ for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ )
{
- h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 64 - dist_scale_factor;
- // ssse3 implementation of biweight doesn't support the extrema.
- // if we ever generate them, we'll have to drop that optimization.
- assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ int dist_scale_factor;
+ x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield];
+ int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field];
+ int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)];
+ int td = x264_clip3( poc1 - poc0, -128, 127 );
+ if( td == 0 /* || pic0 is a long-term ref */ )
+ dist_scale_factor = 256;
+ else
+ {
+ int tb = x264_clip3( cur_poc - poc0, -128, 127 );
+ int tx = (16384 + (abs(td) >> 1)) / td;
+ dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
+ }
+
+ h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor;
+
+ dist_scale_factor >>= 2;
+ if( h->param.analyse.b_weighted_bipred
+ && dist_scale_factor >= -64
+ && dist_scale_factor <= 128 )
+ {
+ h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor;
+ // ssse3 implementation of biweight doesn't support the extrema.
+ // if we ever generate them, we'll have to drop that optimization.
+ assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ }
+ else
+ h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32;
}
- else
- h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 32;
}
- }
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/macroblock.h
^
|
@@ -290,8 +290,10 @@
void x264_macroblock_slice_init( x264_t *h );
void x264_macroblock_thread_init( x264_t *h );
-void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
-void x264_macroblock_cache_load_deblock( x264_t *h );
+void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_interlaced );
+void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
+void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
+void x264_macroblock_deblock_strength( x264_t *h );
void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y );
void x264_macroblock_cache_save( x264_t *h );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/mc.c
^
|
@@ -511,18 +511,17 @@
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
- const int b_interlaced = h->sh.b_mbaff;
- const int stride = frame->i_stride[0] << b_interlaced;
+ const int b_interlaced = PARAM_INTERLACED;
+ int stride = frame->i_stride[0];
const int width = frame->i_width[0];
- int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
- int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
+ int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
+ int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
if( mb_y & b_interlaced )
return;
- for( int y = 0; y <= b_interlaced; y++, offs += frame->i_stride[0] )
- {
+ if( !b_interlaced || h->mb.b_adaptive_mbaff )
h->mc.hpel_filter(
frame->filtered[1] + offs,
frame->filtered[2] + offs,
@@ -530,6 +529,24 @@
frame->plane[0] + offs,
stride, width + 16, height - start,
h->scratch_buffer );
+
+ if( b_interlaced )
+ {
+ /* MC must happen between pixels in the same field. */
+ stride = frame->i_stride[0] << 1;
+ start = (mb_y*16 >> 1) - 8;
+ int height_fld = ((b_end ? frame->i_lines[0] : mb_y*16) >> 1) + 8;
+ offs = start*stride - 8;
+ for( int i = 0; i < 2; i++, offs += frame->i_stride[0] )
+ {
+ h->mc.hpel_filter(
+ frame->filtered_fld[1] + offs,
+ frame->filtered_fld[2] + offs,
+ frame->filtered_fld[3] + offs,
+ frame->plane_fld[0] + offs,
+ stride, width + 16, height_fld - start,
+ h->scratch_buffer );
+ }
}
/* generate integral image:
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/mvpred.c
^
|
@@ -38,12 +38,33 @@
int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
+ // Partitions not yet reached in scan order are unavailable.
if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
{
i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1];
- }
+ if( SLICE_MBAFF
+ && h->mb.cache.ref[i_list][x264_scan8[0]-1] != -2
+ && MB_INTERLACED != h->mb.field[h->mb.i_mb_left_xy[0]] )
+ {
+ if( idx == 2 )
+ {
+ mv_c = h->mb.cache.topright_mv[i_list][0];
+ i_refc = h->mb.cache.topright_ref[i_list][0];
+ }
+ else if( idx == 8 )
+ {
+ mv_c = h->mb.cache.topright_mv[i_list][1];
+ i_refc = h->mb.cache.topright_ref[i_list][1];
+ }
+ else if( idx == 10 )
+ {
+ mv_c = h->mb.cache.topright_mv[i_list][2];
+ i_refc = h->mb.cache.topright_ref[i_list][2];
+ }
+ }
+ }
if( h->mb.i_partition == D_16x8 )
{
if( idx == 0 )
@@ -161,50 +182,95 @@
static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
{
- int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
- int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
- const int type_col = h->fref[1][0]->mb_type[h->mb.i_mb_xy];
- const int partition_col = h->fref[1][0]->mb_partition[h->mb.i_mb_xy];
+ int mb_x = h->mb.i_mb_x;
+ int mb_y = h->mb.i_mb_y;
+ int mb_xy = h->mb.i_mb_xy;
+ int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
+ int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
+ int preshift = MB_INTERLACED;
+ int postshift = MB_INTERLACED;
+ int offset = 1;
+ int yshift = 1;
+ h->mb.i_partition = partition_col[0];
+ if( PARAM_INTERLACED && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
+ {
+ if( MB_INTERLACED )
+ {
+ mb_y = h->mb.i_mb_y&~1;
+ mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+ type_col[0] = h->fref[1][0]->mb_type[mb_xy];
+ type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
+ partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
+ partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
+ preshift = 0;
+ yshift = 0;
+
+ if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
+ (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
+ partition_col[0] != D_8x8 )
+ h->mb.i_partition = D_16x8;
+ else
+ h->mb.i_partition = D_8x8;
+ }
+ else
+ {
+ int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
+ int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
+ >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
+ mb_y = (h->mb.i_mb_y&~1) + col_parity;
+ mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+ type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
+ partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
+ preshift = 1;
+ yshift = 2;
+ h->mb.i_partition = partition_col[0];
+ }
+ offset = 0;
+ }
+ int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x;
+ int i_mb_8x8 = 4 * h->mb.i_mb_stride * mb_y + 2 * mb_x;
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
- h->mb.i_partition = partition_col;
-
- if( IS_INTRA( type_col ) )
- {
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
- x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
- x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
- return 1;
- }
-
/* Don't do any checks other than the ones we have to, based
* on the size of the colocated partitions.
* Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
- int max_i8 = (D_16x16 - partition_col) + 1;
- int step = (partition_col == D_16x8) + 1;
- int width = 4 >> ((D_16x16 - partition_col)&1);
- int height = 4 >> ((D_16x16 - partition_col)>>1);
-
+ int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
+ int step = (h->mb.i_partition == D_16x8) + 1;
+ int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
+ int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
for( int i8 = 0; i8 < max_i8; i8 += step )
{
int x8 = i8&1;
int y8 = i8>>1;
- int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
+ int ypart = (SLICE_MBAFF && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
+ MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
+ 3*y8;
+
+ if( IS_INTRA( type_col[y8] ) )
+ {
+ x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, 0 );
+ x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
+ x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
+ continue;
+ }
+
+ int i_part_8x8 = i_mb_8x8 + x8 + (ypart>>1) * h->mb.i_b8_stride;
int i_ref1_ref = h->fref[1][0]->ref[0][i_part_8x8];
- int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
+ int i_ref = (map_col_to_list0(i_ref1_ref>>preshift) << postshift) + (offset&i_ref1_ref&MB_INTERLACED);
if( i_ref >= 0 )
{
int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
- int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
+ int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + ypart * h->mb.i_b4_stride];
+ int16_t mv_y = (mv_col[1]<<yshift)/2;
int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
- int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
- if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
+ int l0y = ( dist_scale_factor * mv_y + 128 ) >> 8;
+ if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_y > h->mb.mv_max_spel[1]) )
return 0;
x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
- x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
+ x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_y) );
}
else
{
@@ -220,19 +286,10 @@
return 1;
}
-static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
+static ALWAYS_INLINE int x264_mb_predict_mv_direct16x16_spatial( x264_t *h, int b_interlaced )
{
int8_t ref[2];
ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
- const int8_t *l1ref0 = &h->fref[1][0]->ref[0][h->mb.i_b8_xy];
- const int8_t *l1ref1 = &h->fref[1][0]->ref[1][h->mb.i_b8_xy];
- const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref[1][0]->mv[0][h->mb.i_b4_xy],
- (const int16_t (*)[2]) &h->fref[1][0]->mv[1][h->mb.i_b4_xy] };
- const int type_col = h->fref[1][0]->mb_type[h->mb.i_mb_xy];
- const int partition_col = h->fref[1][0]->mb_partition[h->mb.i_mb_xy];
-
- h->mb.i_partition = partition_col;
-
for( int i_list = 0; i_list < 2; i_list++ )
{
int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
@@ -277,6 +334,50 @@
ref[i_list] = i_ref;
}
+ int mb_x = h->mb.i_mb_x;
+ int mb_y = h->mb.i_mb_y;
+ int mb_xy = h->mb.i_mb_xy;
+ int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
+ int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
+ h->mb.i_partition = partition_col[0];
+ if( b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
+ {
+ if( MB_INTERLACED )
+ {
+ mb_y = h->mb.i_mb_y&~1;
+ mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+ type_col[0] = h->fref[1][0]->mb_type[mb_xy];
+ type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
+ partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
+ partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
+
+ if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
+ (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
+ partition_col[0] != D_8x8 )
+ h->mb.i_partition = D_16x8;
+ else
+ h->mb.i_partition = D_8x8;
+ }
+ else
+ {
+ int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
+ int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
+ >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
+ mb_y = (h->mb.i_mb_y&~1) + col_parity;
+ mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+ type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
+ partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
+ h->mb.i_partition = partition_col[0];
+ }
+ }
+ int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy ;
+ int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy ;
+
+ int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
+ int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
+ int16_t (*l1mv[2])[2] = { (int16_t (*)[2]) &h->fref[1][0]->mv[0][i_mb_4x4],
+ (int16_t (*)[2]) &h->fref[1][0]->mv[1][i_mb_4x4] };
+
if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
{
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
@@ -296,24 +397,31 @@
return 0;
}
- if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
+ if( !M64( mv ) || (!b_interlaced && IS_INTRA( type_col[0] )) || (ref[0]&&ref[1]) )
return 1;
/* Don't do any checks other than the ones we have to, based
* on the size of the colocated partitions.
* Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
- int max_i8 = (D_16x16 - partition_col) + 1;
- int step = (partition_col == D_16x8) + 1;
- int width = 4 >> ((D_16x16 - partition_col)&1);
- int height = 4 >> ((D_16x16 - partition_col)>>1);
+ int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
+ int step = (h->mb.i_partition == D_16x8) + 1;
+ int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
+ int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
/* col_zero_flag */
for( int i8 = 0; i8 < max_i8; i8 += step )
{
const int x8 = i8&1;
const int y8 = i8>>1;
- const int o8 = x8 + y8 * h->mb.i_b8_stride;
- const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
+ int ypart = (b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
+ MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
+ 3*y8;
+ int o8 = x8 + (ypart>>1) * h->mb.i_b8_stride;
+ int o4 = 3*x8 + ypart * h->mb.i_b4_stride;
+
+ if( b_interlaced && IS_INTRA( type_col[y8] ) )
+ continue;
+
int idx;
if( l1ref0[o8] == 0 )
idx = 0;
@@ -332,13 +440,29 @@
return 1;
}
+
+static int x264_mb_predict_mv_direct16x16_spatial_interlaced( x264_t *h )
+{
+ return x264_mb_predict_mv_direct16x16_spatial( h, 1 );
+}
+
+static int x264_mb_predict_mv_direct16x16_spatial_progressive( x264_t *h )
+{
+ return x264_mb_predict_mv_direct16x16_spatial( h, 0 );
+}
+
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
{
int b_available;
if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
return 0;
else if( h->sh.b_direct_spatial_mv_pred )
- b_available = x264_mb_predict_mv_direct16x16_spatial( h );
+ {
+ if( SLICE_MBAFF )
+ b_available = x264_mb_predict_mv_direct16x16_spatial_interlaced( h );
+ else
+ b_available = x264_mb_predict_mv_direct16x16_spatial_progressive( h );
+ }
else
b_available = x264_mb_predict_mv_direct16x16_temporal( h );
@@ -426,7 +550,7 @@
}
/* spatial predictors */
- SET_MVP( mvr[h->mb.i_mb_left_xy] );
+ SET_MVP( mvr[h->mb.i_mb_left_xy[0]] );
SET_MVP( mvr[h->mb.i_mb_top_xy] );
SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
SET_MVP( mvr[h->mb.i_mb_topright_xy] );
@@ -438,13 +562,13 @@
x264_frame_t *l0 = h->fref[0][0];
int field = h->mb.i_mb_y&1;
int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
- int refpoc = h->fref[i_list][i_ref>>h->sh.b_mbaff]->i_poc;
+ int refpoc = h->fref[i_list][i_ref>>SLICE_MBAFF]->i_poc;
refpoc += l0->i_delta_poc[field^(i_ref&1)];
#define SET_TMVP( dx, dy ) \
{ \
int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
- int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field]; \
+ int scale = (curpoc - refpoc) * l0->inv_ref_poc[MB_INTERLACED&field]; \
mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
i++; \
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/osdep.c
^
|
@@ -24,16 +24,16 @@
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
-#ifndef __MINGW32__
-#include <sys/time.h>
-#else
+#include "common.h"
+
+#if SYS_WINDOWS
#include <sys/types.h>
#include <sys/timeb.h>
+#else
+#include <sys/time.h>
#endif
#include <time.h>
-#include "common.h"
-
#if PTW32_STATIC_LIB
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
@@ -43,14 +43,14 @@
int64_t x264_mdate( void )
{
-#ifndef __MINGW32__
- struct timeval tv_date;
- gettimeofday( &tv_date, NULL );
- return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec;
-#else
+#if SYS_WINDOWS
struct timeb tb;
ftime( &tb );
return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000;
+#else
+ struct timeval tv_date;
+ gettimeofday( &tv_date, NULL );
+ return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec;
#endif
}
@@ -89,3 +89,35 @@
return 0;
}
#endif
+
+#ifdef __INTEL_COMPILER
+/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
+ * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
+ * adapted to x264's cpu schema. */
+
+// Global variable indicating cpu
+int __intel_cpu_indicator = 0;
+// CPU dispatcher function
+void __intel_cpu_indicator_init( void )
+{
+ unsigned int cpu = x264_cpu_detect();
+ if( cpu&X264_CPU_AVX )
+ __intel_cpu_indicator = 0x20000;
+ else if( cpu&X264_CPU_SSE42 )
+ __intel_cpu_indicator = 0x8000;
+ else if( cpu&X264_CPU_SSE4 )
+ __intel_cpu_indicator = 0x2000;
+ else if( cpu&X264_CPU_SSSE3 )
+ __intel_cpu_indicator = 0x1000;
+ else if( cpu&X264_CPU_SSE3 )
+ __intel_cpu_indicator = 0x800;
+ else if( cpu&X264_CPU_SSE2 && !(cpu&X264_CPU_SSE2_IS_SLOW) )
+ __intel_cpu_indicator = 0x200;
+ else if( cpu&X264_CPU_SSE )
+ __intel_cpu_indicator = 0x80;
+ else if( cpu&X264_CPU_MMXEXT )
+ __intel_cpu_indicator = 8;
+ else
+ __intel_cpu_indicator = 1;
+}
+#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/osdep.h
^
|
@@ -50,6 +50,25 @@
#include <fcntl.h> // _O_BINARY
#endif
+#ifdef __ICL
+#define inline __inline
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define snprintf _snprintf
+#define strtok_r strtok_s
+#define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#endif
+
+#ifdef __INTEL_COMPILER
+#include <mathimf.h>
+#else
+#include <math.h>
+#endif
+
+#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64)
+#define HAVE_X86_INLINE_ASM 1
+#endif
+
#if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS)
#define isfinite finite
#endif
@@ -60,7 +79,11 @@
#endif
#endif
+#ifdef __ICL
+#define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
+#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
+#endif
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
@@ -99,9 +122,14 @@
#define x264_constant_p(x) __builtin_constant_p(x)
#define x264_nonconstant_p(x) (!__builtin_constant_p(x))
#else
-#define UNUSED
+#ifdef __ICL
+#define ALWAYS_INLINE __forceinline
+#define NOINLINE __declspec(noinline)
+#else
#define ALWAYS_INLINE inline
#define NOINLINE
+#endif
+#define UNUSED
#define MAY_ALIAS
#define x264_constant_p(x) 0
#define x264_nonconstant_p(x) 0
@@ -179,19 +207,13 @@
#define asm __asm__
-#if !defined(_WIN64) && !defined(__LP64__)
-#if defined(__INTEL_COMPILER)
-#define BROKEN_STACK_ALIGNMENT 1 /* define it if stack is not mod16 */
-#endif
-#endif
-
#if WORDS_BIGENDIAN
#define endian_fix(x) (x)
#define endian_fix64(x) (x)
#define endian_fix32(x) (x)
#define endian_fix16(x) (x)
#else
-#if defined(__GNUC__) && HAVE_MMX
+#if HAVE_X86_INLINE_ASM && HAVE_MMX
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
asm("bswap %0":"+r"(x));
@@ -209,7 +231,7 @@
return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
}
#endif
-#if defined(__GNUC__) && ARCH_X86_64
+#if HAVE_X86_INLINE_ASM && ARCH_X86_64
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
{
asm("bswap %0":"+r"(x));
@@ -260,7 +282,7 @@
}
#endif
-#if defined(__GNUC__) && HAVE_MMX
+#if HAVE_X86_INLINE_ASM && HAVE_MMX
/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
* using complex address modes properly unless we use inline asm. */
static ALWAYS_INLINE void x264_prefetch( void *p )
@@ -277,7 +299,7 @@
#endif
#if HAVE_POSIXTHREAD
-#if SYS_MINGW
+#if SYS_WINDOWS
#define x264_lower_thread_priority(p)\
{\
x264_pthread_t handle = pthread_self();\
@@ -290,7 +312,7 @@
#else
#include <unistd.h>
#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); }
-#endif /* SYS_MINGW */
+#endif /* SYS_WINDOWS */
#elif HAVE_WIN32THREAD
#define x264_lower_thread_priority(p) SetThreadPriority( GetCurrentThread(), X264_MAX( -2, -p ) )
#else
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/pixel.c
^
|
@@ -641,6 +641,36 @@
return ssim;
}
+int pixel_vsad( pixel *src, int stride, int height )
+{
+ int score = 0;
+ for( int i = 1; i < height; i++, src += stride )
+ for( int j = 0; j < 16; j++ )
+ score += abs(src[j] - src[j+stride]);
+ return score;
+}
+
+int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
+{
+ int score_field, score_frame;
+ int stride = h->fenc->i_stride[0];
+ int mb_stride = h->mb.i_mb_stride;
+ pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride);
+ int mb_xy = mb_x + mb_y*mb_stride;
+
+ /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */
+ int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 );
+ score_frame = h->pixf.vsad( fenc, stride, mbpair_height );
+ score_field = h->pixf.vsad( fenc, stride*2, mbpair_height >> 1 );
+ score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 );
+
+ if( mb_x > 0 )
+ score_field += 512 - h->mb.field[mb_xy -1]*1024;
+ if( mb_y > 0 )
+ score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024;
+
+ return (score_field < score_frame);
+}
/****************************************************************************
* successive elimination
@@ -746,6 +776,7 @@
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->var2_8x8 = pixel_var2_8x8;
+ pixf->vsad = pixel_vsad;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4;
@@ -873,6 +904,7 @@
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
+ pixf->vsad = x264_pixel_vsad_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
@@ -921,6 +953,7 @@
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+ pixf->vsad = x264_pixel_vsad_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/pixel.h
^
|
@@ -47,10 +47,12 @@
PIXEL_2x2 = 9,
};
-static const struct {
+static const struct
+{
int w;
int h;
-} x264_pixel_size[7] = {
+} x264_pixel_size[7] =
+{
{ 16, 16 },
{ 16, 8 }, { 8, 16 },
{ 8, 8 },
@@ -58,7 +60,8 @@
{ 4, 4 }
};
-static const uint8_t x264_size2pixel[5][5] = {
+static const uint8_t x264_size2pixel[5][5] =
+{
{ 0, },
{ 0, PIXEL_4x4, PIXEL_8x4, 0, 0 },
{ 0, PIXEL_4x8, PIXEL_8x8, 0, PIXEL_16x8 },
@@ -79,6 +82,7 @@
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+ int (*vsad)( pixel *, int, int );
int (*var2_8x8)( pixel *, int, pixel *, int, int * );
uint64_t (*var[4])( pixel *pix, int stride );
@@ -122,5 +126,6 @@
void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf );
+int x264_field_vsad( x264_t *h, int mb_x, int mb_y );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/ppc/mc.c
^
|
@@ -856,6 +856,318 @@
dstc += dst_stride;
}
}
+
+static void mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ const x264_weight_t *weight, int i_height )
+{
+ LOAD_ZERO;
+ PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ vec_u8_t srcv;
+ vec_s16_t weightv;
+ vec_s16_t scalev, offsetv, denomv, roundv;
+ vec_s16_u loadv;
+
+ int denom = weight->i_denom;
+
+ loadv.s[0] = weight->i_scale;
+ scalev = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = weight->i_offset;
+ offsetv = vec_splat( loadv.v, 0 );
+
+ if( denom >= 1 )
+ {
+ loadv.s[0] = denom;
+ denomv = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = 1<<(denom - 1);
+ roundv = vec_splat( loadv.v, 0 );
+
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, roundv );
+ weightv = vec_sra( weightv, (vec_u16_t)denomv );
+ weightv = vec_add( weightv, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
+ }
+ }
+ else
+ {
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
+ }
+ }
+}
+static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ const x264_weight_t *weight, int i_height )
+{
+ LOAD_ZERO;
+ PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ vec_u8_t srcv;
+ vec_s16_t weightv;
+ vec_s16_t scalev, offsetv, denomv, roundv;
+ vec_s16_u loadv;
+
+ int denom = weight->i_denom;
+
+ loadv.s[0] = weight->i_scale;
+ scalev = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = weight->i_offset;
+ offsetv = vec_splat( loadv.v, 0 );
+
+ if( denom >= 1 )
+ {
+ loadv.s[0] = denom;
+ denomv = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = 1<<(denom - 1);
+ roundv = vec_splat( loadv.v, 0 );
+
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, roundv );
+ weightv = vec_sra( weightv, (vec_u16_t)denomv );
+ weightv = vec_add( weightv, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
+ }
+ }
+ else
+ {
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
+ }
+ }
+}
+static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ const x264_weight_t *weight, int i_height )
+{
+ LOAD_ZERO;
+ PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ PREP_STORE8;
+ vec_u8_t srcv;
+ vec_s16_t weightv;
+ vec_s16_t scalev, offsetv, denomv, roundv;
+ vec_s16_u loadv;
+
+ int denom = weight->i_denom;
+
+ loadv.s[0] = weight->i_scale;
+ scalev = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = weight->i_offset;
+ offsetv = vec_splat( loadv.v, 0 );
+
+ if( denom >= 1 )
+ {
+ loadv.s[0] = denom;
+ denomv = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = 1<<(denom - 1);
+ roundv = vec_splat( loadv.v, 0 );
+
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, roundv );
+ weightv = vec_sra( weightv, (vec_u16_t)denomv );
+ weightv = vec_add( weightv, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ VEC_STORE8( srcv, dst );
+ }
+ }
+ else
+ {
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ weightv = vec_u8_to_s16( srcv );
+
+ weightv = vec_mladd( weightv, scalev, offsetv );
+
+ srcv = vec_packsu( weightv, zero_s16v );
+ VEC_STORE8( srcv, dst );
+ }
+ }
+}
+static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ const x264_weight_t *weight, int i_height )
+{
+ LOAD_ZERO;
+ PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ vec_u8_t srcv;
+ vec_s16_t weight_lv, weight_hv;
+ vec_s16_t scalev, offsetv, denomv, roundv;
+ vec_s16_u loadv;
+
+ int denom = weight->i_denom;
+
+ loadv.s[0] = weight->i_scale;
+ scalev = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = weight->i_offset;
+ offsetv = vec_splat( loadv.v, 0 );
+
+ if( denom >= 1 )
+ {
+ loadv.s[0] = denom;
+ denomv = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = 1<<(denom - 1);
+ roundv = vec_splat( loadv.v, 0 );
+
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+
+ weight_hv = vec_mladd( weight_hv, scalev, roundv );
+ weight_lv = vec_mladd( weight_lv, scalev, roundv );
+ weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
+ weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
+ weight_hv = vec_add( weight_hv, offsetv );
+ weight_lv = vec_add( weight_lv, offsetv );
+
+ srcv = vec_packsu( weight_hv, weight_lv );
+ vec_st( srcv, 0, dst );
+ }
+ }
+ else
+ {
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+
+ weight_hv = vec_mladd( weight_hv, scalev, offsetv );
+ weight_lv = vec_mladd( weight_lv, scalev, offsetv );
+
+ srcv = vec_packsu( weight_hv, weight_lv );
+ vec_st( srcv, 0, dst );
+ }
+ }
+}
+static void mc_weight_w20_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ const x264_weight_t *weight, int i_height )
+{
+ LOAD_ZERO;
+ PREP_LOAD_SRC( src );
+ vec_u8_t src_1v, src_2v, src_3v;
+ vec_s16_t weight_lv, weight_hv, weight_3v;
+ vec_s16_t scalev, offsetv, denomv, roundv;
+ vec_s16_u loadv;
+
+ int denom = weight->i_denom;
+
+ loadv.s[0] = weight->i_scale;
+ scalev = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = weight->i_offset;
+ offsetv = vec_splat( loadv.v, 0 );
+
+ if( denom >= 1 )
+ {
+ loadv.s[0] = denom;
+ denomv = vec_splat( loadv.v, 0 );
+
+ loadv.s[0] = 1<<(denom - 1);
+ roundv = vec_splat( loadv.v, 0 );
+
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ src_1v = vec_ld( 0, src );
+ src_2v = vec_ld( 16, src );
+ src_3v = vec_ld( 19, src );
+ src_1v = vec_perm( src_1v, src_2v, _src_ );
+ src_3v = vec_perm( src_2v, src_3v, _src_ );
+ weight_hv = vec_u8_to_s16_h( src_1v );
+ weight_lv = vec_u8_to_s16_l( src_1v );
+ weight_3v = vec_u8_to_s16_h( src_3v );
+
+ weight_hv = vec_mladd( weight_hv, scalev, roundv );
+ weight_lv = vec_mladd( weight_lv, scalev, roundv );
+ weight_3v = vec_mladd( weight_3v, scalev, roundv );
+ weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
+ weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
+ weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
+ weight_hv = vec_add( weight_hv, offsetv );
+ weight_lv = vec_add( weight_lv, offsetv );
+ weight_3v = vec_add( weight_3v, offsetv );
+
+ src_1v = vec_packsu( weight_hv, weight_lv );
+ src_3v = vec_packsu( weight_3v, zero_s16v );
+ vec_st( src_1v, 0, dst );
+ vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
+ }
+ }
+ else
+ {
+ for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
+ {
+ src_1v = vec_ld( 0, src );
+ src_2v = vec_ld( 16, src );
+ src_3v = vec_ld( 19, src );
+ src_1v = vec_perm( src_1v, src_2v, _src_ );
+ src_3v = vec_perm( src_2v, src_3v, _src_ );
+ weight_hv = vec_u8_to_s16_h( src_1v );
+ weight_lv = vec_u8_to_s16_l( src_1v );
+ weight_3v = vec_u8_to_s16_h( src_3v );
+
+ weight_hv = vec_mladd( weight_hv, scalev, offsetv );
+ weight_lv = vec_mladd( weight_lv, scalev, offsetv );
+ weight_3v = vec_mladd( weight_3v, scalev, offsetv );
+
+ src_1v = vec_packsu( weight_hv, weight_lv );
+ src_3v = vec_packsu( weight_3v, zero_s16v );
+ vec_st( src_1v, 0, dst );
+ vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
+ }
+ }
+}
+
+static weight_fn_t x264_mc_weight_wtab_altivec[6] =
+{
+ mc_weight_w2_altivec,
+ mc_weight_w4_altivec,
+ mc_weight_w8_altivec,
+ mc_weight_w16_altivec,
+ mc_weight_w16_altivec,
+ mc_weight_w20_altivec,
+};
+
#endif // !HIGH_BIT_DEPTH
void x264_mc_altivec_init( x264_mc_functions_t *pf )
@@ -870,5 +1182,7 @@
pf->hpel_filter = x264_hpel_filter_altivec;
pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
+
+ pf->weight = x264_mc_weight_wtab_altivec;
#endif // !HIGH_BIT_DEPTH
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/ppc/pixel.c
^
|
@@ -26,6 +26,7 @@
#include "common/common.h"
#include "ppccommon.h"
+#include "../predict.h"
#if !HIGH_BIT_DEPTH
/***********************************************************************
@@ -1983,6 +1984,61 @@
sums[0][3] = temp[0];
sums[1][3] = temp[1];
}
+
+#define SATD_X( size ) \
+static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+{\
+ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
+}\
+static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+{\
+ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
+ scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\
+}
+SATD_X( 16x16 )\
+SATD_X( 16x8 )\
+SATD_X( 8x16 )\
+SATD_X( 8x8 )\
+SATD_X( 8x4 )\
+SATD_X( 4x8 )\
+SATD_X( 4x4 )
+
+
+#define INTRA_MBCMP_8x8( mbcmp )\
+void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[33], int res[3] )\
+{\
+ ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
+ x264_predict_8x8_v_c( pix, edge );\
+ res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
+ x264_predict_8x8_h_c( pix, edge );\
+ res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
+ x264_predict_8x8_dc_c( pix, edge );\
+ res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
+}
+
+INTRA_MBCMP_8x8(sad)
+INTRA_MBCMP_8x8(sa8d)
+
+#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
+void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\
+{\
+ x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
+ res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
+ x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
+ res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
+ x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
+ res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
+}
+
+INTRA_MBCMP(satd, 4, v, h, dc, )
+INTRA_MBCMP(sad, 8, dc, h, v, c )
+INTRA_MBCMP(satd, 8, dc, h, v, c )
+INTRA_MBCMP(sad, 16, v, h, dc, )
+INTRA_MBCMP(satd, 16, v, h, dc, )
#endif // !HIGH_BIT_DEPTH
/****************************************************************************
@@ -2014,12 +2070,38 @@
pixf->satd[PIXEL_4x8] = pixel_satd_4x8_altivec;
pixf->satd[PIXEL_4x4] = pixel_satd_4x4_altivec;
+ pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
+ pixf->satd_x3[PIXEL_8x16] = pixel_satd_x3_8x16_altivec;
+ pixf->satd_x3[PIXEL_16x8] = pixel_satd_x3_16x8_altivec;
+ pixf->satd_x3[PIXEL_8x8] = pixel_satd_x3_8x8_altivec;
+ pixf->satd_x3[PIXEL_8x4] = pixel_satd_x3_8x4_altivec;
+ pixf->satd_x3[PIXEL_4x8] = pixel_satd_x3_4x8_altivec;
+ pixf->satd_x3[PIXEL_4x4] = pixel_satd_x3_4x4_altivec;
+
+ pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
+ pixf->satd_x4[PIXEL_8x16] = pixel_satd_x4_8x16_altivec;
+ pixf->satd_x4[PIXEL_16x8] = pixel_satd_x4_16x8_altivec;
+ pixf->satd_x4[PIXEL_8x8] = pixel_satd_x4_8x8_altivec;
+ pixf->satd_x4[PIXEL_8x4] = pixel_satd_x4_8x4_altivec;
+ pixf->satd_x4[PIXEL_4x8] = pixel_satd_x4_4x8_altivec;
+ pixf->satd_x4[PIXEL_4x4] = pixel_satd_x4_4x4_altivec;
+
+ pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_altivec;
+ pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_altivec;
+ pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_altivec;
+
+ pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_altivec;
+ pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_altivec;
+ pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;
+
pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_altivec;
pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_altivec;
+ pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_altivec;
+
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_altivec;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/quant.c
^
|
@@ -141,6 +141,66 @@
}
}
+static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf )
+{
+ int d0 = dct[0] + dct[1];
+ int d1 = dct[2] + dct[3];
+ int d2 = dct[0] - dct[1];
+ int d3 = dct[2] - dct[3];
+ out[0] = (d0 + d1) * dequant_mf >> 5;
+ out[1] = (d0 - d1) * dequant_mf >> 5;
+ out[2] = (d2 + d3) * dequant_mf >> 5;
+ out[3] = (d2 - d3) * dequant_mf >> 5;
+}
+
+static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf )
+{
+ dctcoef out[4];
+ idct_dequant_2x2_dconly( out, dct, dequant_mf );
+ return ((ref[0] ^ (out[0]+32))
+ | (ref[1] ^ (out[1]+32))
+ | (ref[2] ^ (out[2]+32))
+ | (ref[3] ^ (out[3]+32))) >> 6;
+}
+
+static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+{
+ /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
+ dctcoef dct_orig[4];
+ int coeff, nz;
+
+ idct_dequant_2x2_dconly( dct_orig, dct, dequant_mf );
+ dct_orig[0] += 32;
+ dct_orig[1] += 32;
+ dct_orig[2] += 32;
+ dct_orig[3] += 32;
+
+ /* If the DC coefficients already round to zero, terminate early. */
+ if( !((dct_orig[0]|dct_orig[1]|dct_orig[2]|dct_orig[3])>>6) )
+ return 0;
+
+ /* Start with the highest frequency coefficient... is this the best option? */
+ for( nz = 0, coeff = 3; coeff >= 0; coeff-- )
+ {
+ int level = dct[coeff];
+ int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */
+
+ while( level )
+ {
+ dct[coeff] = level - sign;
+ if( idct_dequant_round_2x2_dc( dct_orig, dct, dequant_mf ) )
+ {
+ nz = 1;
+ dct[coeff] = level;
+ break;
+ }
+ level -= sign;
+ }
+ }
+
+ return nz;
+}
+
static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
for( int i = 0; i < size; i++ )
@@ -272,6 +332,8 @@
pf->dequant_4x4_dc = dequant_4x4_dc;
pf->dequant_8x8 = dequant_8x8;
+ pf->optimize_chroma_dc = optimize_chroma_dc;
+
pf->denoise_dct = x264_denoise_dct;
pf->decimate_score15 = x264_decimate_score15;
pf->decimate_score16 = x264_decimate_score16;
@@ -427,6 +489,7 @@
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
+ pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
@@ -457,6 +520,7 @@
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
+ pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
@@ -473,6 +537,7 @@
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
+ pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4;
}
if( cpu&X264_CPU_AVX )
@@ -480,6 +545,7 @@
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
+ pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/quant.h
^
|
@@ -38,6 +38,8 @@
void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+ int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf );
+
void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int (*decimate_score15)( dctcoef *dct );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/rectangle.h
^
|
@@ -80,6 +80,15 @@
{
/* height 1, width 16 doesn't occur */
assert( h != 1 );
+#if HAVE_VECTOREXT && defined(__SSE__)
+ v4si v16 = {v,v,v,v};
+
+ M128( d+s*0+0 ) = (__m128)v16;
+ M128( d+s*1+0 ) = (__m128)v16;
+ if( h == 2 ) return;
+ M128( d+s*2+0 ) = (__m128)v16;
+ M128( d+s*3+0 ) = (__m128)v16;
+#else
if( WORD_SIZE == 8 )
{
do
@@ -103,6 +112,7 @@
d += s;
} while( --h );
}
+#endif
}
else
assert(0);
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/set.c
^
|
@@ -24,7 +24,6 @@
*****************************************************************************/
#define _ISOC99_SOURCE
-#include <math.h>
#include "common.h"
#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
@@ -196,7 +195,7 @@
int dct8x8 = cat == 1;
int size = dct8x8 ? 64 : 16;
udctcoef *nr_offset = h->nr_offset_emergency[q][cat];
- /* Denoise chroma first (due to h264's chroma QP offset, then luma, then DC. */
+ /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
int dc_threshold = (QP_MAX-QP_MAX_SPEC)*2/3;
int luma_threshold = (QP_MAX-QP_MAX_SPEC)*2/3;
int chroma_threshold = 0;
@@ -237,6 +236,10 @@
h->param.rc.i_qp_max = min_qp_err-1;
if( max_qp_err >= h->param.rc.i_qp_min )
h->param.rc.i_qp_min = max_qp_err+1;
+ /* If long level-codes aren't allowed, we need to allow QP high enough to avoid them. */
+ if( !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH )
+ while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_max)] <= 12 || h->param.rc.i_qp_max <= 12 )
+ h->param.rc.i_qp_max++;
if( h->param.rc.i_qp_min > h->param.rc.i_qp_max )
{
x264_log( h, X264_LOG_ERROR, "Impossible QP constraints for CQM (min=%d, max=%d)\n", h->param.rc.i_qp_min, h->param.rc.i_qp_max );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/vlc.c
^
|
@@ -695,7 +695,7 @@
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
-void x264_init_vlc_tables( void )
+void x264_cavlc_init( void )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/win32thread.c
^
|
@@ -59,7 +59,7 @@
static x264_win32thread_control_t thread_control;
/* _beginthreadex requires that the start routine is __stdcall */
-static __stdcall unsigned x264_win32thread_worker( void *arg )
+static unsigned __stdcall x264_win32thread_worker( void *arg )
{
x264_pthread_t *h = arg;
h->ret = h->func( h->arg );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/cabac-a.asm
^
|
@@ -35,13 +35,13 @@
; t3 must be ecx, since it's used for shift.
%ifdef WIN64
- DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
+ DECLARE_REG_TMP 3,1,2,0,4,5,6,2
%define pointer resq
%elifdef ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%define pointer resq
%else
- DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%define pointer resd
%endif
@@ -75,21 +75,21 @@
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
- movzx t4d, byte [t0+cb.state+t1]
+ movzx t6d, byte [t0+cb.state+t1]
+ mov t4d, ~1
mov t3d, t5d
- mov t6d, t4d
+ and t4d, t6d
shr t5d, 6
- shr t4d, 1
movifnidn t2d, r2m
- LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
+ LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
and t6d, 1
sub t3d, t5d
cmp t6d, t2d
mov t6d, [t0+cb.low]
- lea t7, [t6+t3]
+ lea t2, [t6+t3]
cmovne t3d, t5d
- cmovne t6d, t7d
+ cmovne t6d, t2d
mov [t0+cb.state+t1], t4b
;cabac_encode_renorm
mov t4d, t3d
@@ -108,10 +108,9 @@
cglobal cabac_encode_bypass_asm, 0,3
movifnidn t0, r0mp
movifnidn t3d, r1m
- neg t3d
- mov t8d, [t0+cb.low]
+ mov t7d, [t0+cb.low]
and t3d, [t0+cb.range]
- lea t8d, [t8*2+t3]
+ lea t7d, [t7*2+t3]
mov t3d, [t0+cb.queue]
inc t3d
%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
@@ -119,12 +118,12 @@
%else
jge .putbyte
%endif
- mov [t0+cb.low], t8d
+ mov [t0+cb.low], t7d
mov [t0+cb.queue], t3d
RET
.putbyte:
PROLOGUE 0,7
- movifnidn t6d, t8d
+ movifnidn t6d, t7d
jmp cabac_putbyte
cglobal cabac_encode_terminal_asm, 0,3
@@ -163,7 +162,7 @@
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
- mov t1, [t0+cb.p]
+ mov t1, [t0+cb.p]
add [t1-1], dh ; t2h
dec dh
.loop_outstanding:
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/const-a.asm
^
|
@@ -51,6 +51,7 @@
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
+const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/deblock-a.asm
^
|
@@ -1963,7 +1963,7 @@
%define ref r1+scan8start
%define mv r2+scan8start*4
%define bs0 r3
-%define bs1 r3+16
+%define bs1 r3+32
%macro LOAD_BYTES_MMX 1
movd m2, [%1+8*0-1]
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/mc-a2.asm
^
|
@@ -40,7 +40,7 @@
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
-pf_inv256: times 4 dd 0.00390625
+pf_inv256: times 8 dd 0.00390625
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
@@ -1128,7 +1128,7 @@
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size or a size less than 64.
+; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
@@ -1136,12 +1136,15 @@
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
- jz .copy32
+ jz .copy32start
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
+.copy32start
+ test r2d, r2d
+ jz .ret
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
@@ -1153,6 +1156,7 @@
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
+.ret
REP_RET
;-----------------------------------------------------------------------------
@@ -1166,12 +1170,15 @@
movdqa [r0 + r2], xmm0
.copy32:
test r2d, 32
- jz .copy64
+ jz .copy64start
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
+.copy64start
+ test r2d, r2d
+ jz .ret
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
@@ -1183,6 +1190,7 @@
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 48], xmm3
jg .copy64
+.ret:
REP_RET
;-----------------------------------------------------------------------------
@@ -1622,7 +1630,7 @@
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost_sse2, 7,7,7
- shl r6d, 1
+ add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
add r2, r6
@@ -1665,3 +1673,49 @@
jl .loop
REP_RET
+%macro INT16_TO_FLOAT 1
+ vpunpckhwd xmm4, xmm%1, xmm7
+ vpunpcklwd xmm%1, xmm7
+ vinsertf128 ymm%1, ymm%1, xmm4, 1
+ vcvtdq2ps ymm%1, ymm%1
+%endmacro
+
+; FIXME: align loads/stores to 16 bytes
+cglobal mbtree_propagate_cost_avx, 7,7,8
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ vmovdqa xmm5, [pw_3fff]
+ vbroadcastss ymm6, [r5]
+ vmulps ymm6, ymm6, [pf_inv256]
+ vpxor xmm7, xmm7
+.loop:
+ vmovdqu xmm0, [r2+r6] ; intra
+ vmovdqu xmm1, [r4+r6] ; invq
+ vmovdqu xmm2, [r1+r6] ; prop
+ vpand xmm3, xmm5, [r3+r6] ; inter
+ INT16_TO_FLOAT 0
+ INT16_TO_FLOAT 1
+ INT16_TO_FLOAT 2
+ INT16_TO_FLOAT 3
+ vmulps ymm1, ymm1, ymm0
+ vsubps ymm4, ymm0, ymm3
+ vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
+ vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
+ vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
+ vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
+ vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
+ vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
+ vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
+ vmulps ymm1, ymm1, ymm3 ; / intra
+ vcvtps2dq ymm1, ymm1
+ vmovdqu [r0+r6*2], ymm1
+ add r6, 16
+ jl .loop
+ vzeroupper
+ RET
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/mc-c.c
^
|
@@ -140,6 +140,8 @@
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
@@ -728,4 +730,8 @@
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#endif // HIGH_BIT_DEPTH
+
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/pixel.h
^
|
@@ -138,6 +138,8 @@
int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_vsad_mmxext( pixel *src, int stride, int height );
+int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/predict-c.c
^
|
@@ -180,7 +180,7 @@
PREDICT_16x16_P( avx )
#endif //!HIGH_BIT_DEPTH
-#ifdef __GNUC__
+#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
static void x264_predict_16x16_p_sse2( uint16_t *src )
#else
@@ -191,10 +191,10 @@
int H, V;
#if HIGH_BIT_DEPTH
asm (
- "movdqu -2+%1, %%xmm1 \n"
- "movdqa 16+%1, %%xmm0 \n"
- "pmaddwd %2, %%xmm0 \n"
- "pmaddwd %3, %%xmm1 \n"
+ "movdqu %1, %%xmm1 \n"
+ "movdqa %2, %%xmm0 \n"
+ "pmaddwd %3, %%xmm0 \n"
+ "pmaddwd %4, %%xmm1 \n"
"paddd %%xmm1, %%xmm0 \n"
"movhlps %%xmm0, %%xmm1 \n"
"paddd %%xmm1, %%xmm0 \n"
@@ -202,24 +202,26 @@
"paddd %%xmm1, %%xmm0 \n"
"movd %%xmm0, %0 \n"
:"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pw_12345678), "m"(*pw_m87654321)
+ :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),
+ "m"(*pw_12345678), "m"(*pw_m87654321)
);
#else
asm (
"movq %1, %%mm1 \n"
- "movq 8+%1, %%mm0 \n"
- "palignr $7, -8+%1, %%mm1 \n"
- "pmaddubsw %2, %%mm0 \n"
- "pmaddubsw %3, %%mm1 \n"
+ "movq %2, %%mm0 \n"
+ "palignr $7, %3, %%mm1 \n"
+ "pmaddubsw %4, %%mm0 \n"
+ "pmaddubsw %5, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"pshufw $14, %%mm0, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"pshufw $1, %%mm0, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"movd %%mm0, %0 \n"
- "movsx %w0, %0 \n"
+ "movswl %w0, %0 \n"
:"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
+ :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),
+ "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)
);
#endif
V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
@@ -269,7 +271,7 @@
#endif //!HIGH_BIT_DEPTH
-#ifdef __GNUC__
+#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
static void x264_predict_8x8c_p_sse2( uint16_t *src )
#else
@@ -299,7 +301,7 @@
"pshufw $1, %%mm0, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"movd %%mm0, %0 \n"
- "movsx %w0, %0 \n"
+ "movswl %w0, %0 \n"
:"=r"(H)
:"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
);
@@ -430,7 +432,9 @@
pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2;
+#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
+#endif
#else
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
@@ -447,7 +451,7 @@
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
-#ifdef __GNUC__
+#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
#endif
if( !(cpu&X264_CPU_AVX) )
@@ -471,7 +475,9 @@
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
+#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
+#endif
#else
#if ARCH_X86_64
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
@@ -491,7 +497,7 @@
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
-#ifdef __GNUC__
+#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
#endif
#endif // HIGH_BIT_DEPTH
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/quant-a.asm
^
|
@@ -7,6 +7,7 @@
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Oskar Arvidsson <oskar@irock.se>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -69,12 +70,18 @@
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
+chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
+chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
+chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
+chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
+
SECTION .text
cextern pb_1
cextern pw_1
cextern pd_1
cextern pb_01
+cextern pd_1024
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
@@ -117,12 +124,18 @@
psignw %1, %2
%endmacro
-%macro PSIGND_MMX 2
+%macro PSIGND_MMX 2-3
+%if %0==3
+ mova %1, %2
+ pxor %1, %3
+ psubd %1, %3
+%else
pxor %1, %2
psubd %1, %2
+%endif
%endmacro
-%macro PSIGND_SSSE3 2
+%macro PSIGND_SSSE3 2+
psignd %1, %2
%endmacro
@@ -747,6 +760,126 @@
DEQUANT_DC avx , w
%endif
+; t4 is eax for return value.
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
+%else
+ DECLARE_REG_TMP 4,1,2,3,0,5
+%endif
+
+;-----------------------------------------------------------------------------
+; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+;-----------------------------------------------------------------------------
+
+; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
+%macro OPTIMIZE_CHROMA_DC 2
+%assign %%regs 4+%2
+%ifndef ARCH_X86_64
+ %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
+%endif
+cglobal optimize_chroma_dc_%1, 0,%%regs,7
+ movifnidn t0, r0mp
+ movd m2, r1m
+ movq m1, [t0]
+%if %2
+ pxor m4, m4
+%else ; sse4, avx
+ pcmpeqb m4, m4
+ pslld m4, 11
+%endif
+%ifidn %1, sse2
+ mova m3, [chroma_dc_dct_mask_mmx]
+ mova m5, [chroma_dc_dmf_mask_mmx]
+%else
+ mova m3, [chroma_dc_dct_mask]
+ mova m5, [chroma_dc_dmf_mask]
+%endif
+ pshuflw m2, m2, 0
+ pshufd m0, m1, 00010001b ; 1 0 3 2 1 0 3 2
+ punpcklqdq m2, m2
+ punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
+ mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
+ PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
+ PSIGNW m2, m5 ; + - - + - - + +
+ paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
+ pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
+ punpcklwd m1, m1
+ psrad m2, 16 ; + - - +
+ mov t1d, 3
+ paddd m0, m6
+ xor t4d, t4d
+%ifidn %1, sse2
+ psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
+%endif
+%if %2
+ mova m6, m0
+ SWAP 0, 6
+ psrad m6, 11
+ pcmpeqd m6, m4
+ pmovmskb t5d, m6
+ cmp t5d, 0xffff
+%else ; sse4, avx
+ ptest m0, m4
+%endif
+ jz .ret ; if the DC coefficients already round to zero, terminate early
+ mova m3, m0
+.outer_loop:
+ movsx t3d, word [t0+2*t1] ; dct[coeff]
+ pshufd m6, m1, 11111111b
+ pshufd m1, m1, 10010000b ; move the next element to high dword
+ PSIGND m5, m2, m6
+ test t3d, t3d
+ jz .loop_end
+.outer_loop_0:
+ mov t2d, t3d
+ sar t3d, 31
+ or t3d, 1
+.inner_loop:
+ psubd m3, m5 ; coeff -= sign
+ pxor m6, m0, m3
+%if %2
+ psrad m6, 11
+ pcmpeqd m6, m4
+ pmovmskb t5d, m6
+ cmp t5d, 0xffff
+%else ; sse4, avx
+ ptest m6, m4
+%endif
+ jz .round_coeff
+ paddd m3, m5 ; coeff += sign
+ mov t4d, 1
+.loop_end:
+ dec t1d
+ jz .last_coeff
+ pshufd m2, m2, 01111000b ; - + - + / - - + +
+ jg .outer_loop
+.ret:
+ REP_RET
+.round_coeff:
+ sub t2d, t3d
+ mov [t0+2*t1], t2w
+ jnz .inner_loop
+ jmp .loop_end
+.last_coeff:
+ movsx t3d, word [t0]
+ punpcklqdq m2, m2 ; + + + +
+ PSIGND m5, m2, m1
+ test t3d, t3d
+ jnz .outer_loop_0
+ REP_RET
+%endmacro
+
+INIT_XMM
+%define PSIGNW PSIGNW_MMX
+%define PSIGND PSIGND_MMX
+OPTIMIZE_CHROMA_DC sse2, 1
+%define PSIGNW PSIGNW_SSSE3
+%define PSIGND PSIGND_SSSE3
+OPTIMIZE_CHROMA_DC ssse3, 1
+OPTIMIZE_CHROMA_DC sse4, 0
+INIT_AVX
+OPTIMIZE_CHROMA_DC avx, 0
+
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/quant.h
^
|
@@ -57,6 +57,10 @@
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf );
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/sad-a.asm
^
|
@@ -273,6 +273,71 @@
RET
;-----------------------------------------------------------------------------
+; void pixel_vsad( pixel *src, int stride );
+;-----------------------------------------------------------------------------
+
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal pixel_vsad_mmxext, 3,3
+ mova m0, [r0]
+ mova m1, [r0+8]
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+ lea r0, [r0+r1*2]
+ psadbw m0, m2
+ psadbw m1, m3
+ paddw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m4, [r0]
+ mova m5, [r0+8]
+ mova m6, [r0+r1]
+ mova m7, [r0+r1+8]
+ lea r0, [r0+r1*2]
+ psadbw m2, m4
+ psadbw m3, m5
+ psadbw m4, m6
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m2, m6
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+ movd eax, m0
+ RET
+%endif
+
+INIT_XMM
+cglobal pixel_vsad_sse2, 3,3
+ mova m0, [r0]
+ mova m1, [r0+r1]
+ lea r0, [r0+r1*2]
+ psadbw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m2, [r0]
+ mova m3, [r0+r1]
+ lea r0, [r0+r1*2]
+ psadbw m1, m2
+ psadbw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+ sub r2d, 2
+ jg .loop
+.end:
+ movhlps m1, m0
+ paddw m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/common/x86/util.h
^
|
@@ -27,11 +27,19 @@
#ifndef X264_X86_UTIL_H
#define X264_X86_UTIL_H
-#ifdef __GNUC__
-
#ifdef __SSE__
#include <xmmintrin.h>
+
+#undef M128_ZERO
+#define M128_ZERO ((__m128){0,0,0,0})
+#define x264_union128_t x264_union128_sse_t
+typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+#if HAVE_VECTOREXT
+typedef uint32_t v4si __attribute__((vector_size (16)));
#endif
+#endif // __SSE__
+
+#if HAVE_X86_INLINE_ASM && HAVE_MMX
#define x264_median_mv x264_median_mv_mmxext
static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
@@ -92,11 +100,13 @@
{
static const uint64_t pb_2 = 0x0202020202020202ULL;
static const uint64_t pb_32 = 0x2020202020202020ULL;
+ static const uint64_t pb_33 = 0x2121212121212121ULL;
int amvd;
asm(
"movd %1, %%mm0 \n"
"movd %2, %%mm1 \n"
- "paddb %%mm1, %%mm0 \n"
+ "paddusb %%mm1, %%mm0 \n"
+ "pminub %5, %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"pcmpgtb %3, %%mm0 \n"
@@ -106,7 +116,7 @@
"movd %%mm2, %0 \n"
:"=r"(amvd)
:"m"(M16( mvdleft )),"m"(M16( mvdtop )),
- "m"(pb_2),"m"(pb_32)
+ "m"(pb_2),"m"(pb_32),"m"(pb_33)
);
return amvd;
}
@@ -149,13 +159,6 @@
);
}
-#ifdef __SSE__
-#undef M128_ZERO
-#define M128_ZERO ((__m128){0,0,0,0})
-#define x264_union128_t x264_union128_sse_t
-typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
-#endif
-
#endif
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/configure
^
|
@@ -7,6 +7,10 @@
available options:
--help print this message
+ --disable-cli disables cli
+ --system-libx264 use system libx264 instead of internal
+ --enable-shared build shared library
+ --enable-static build static library
--disable-avs disables avisynth support (windows only)
--disable-lavf disables libavformat support
--disable-ffms disables ffmpegsource support
@@ -16,11 +20,12 @@
--enable-win32thread use win32threads (windows only)
--disable-swscale disables swscale support
--disable-asm disables platform-specific assembly optimizations
- --enable-debug adds -g, doesn't strip
- --enable-gprof adds -pg, doesn't strip
+ --disable-interlaced disables interlaced encoding support
+ --enable-debug adds -g
+ --enable-gprof adds -pg
+ --enable-strip adds -s
--enable-visualize enables visualization (X11 only)
--enable-pic build position-independent code
- --enable-shared build shared library
--bit-depth=BIT_DEPTH sets output bit depth (8-10), default 8
--extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS
@@ -49,6 +54,45 @@
echo "$1" >> config.log
}
+intel_cflags() {
+ # Intel Compiler issues an incredibly large number of warnings on any warning level,
+ # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them
+ for arg in $*; do
+ [ $arg = -ffast-math ] && arg=
+ [[ "$arg" = -falign-loops* ]] && arg=
+ [ "$arg" = -fno-tree-vectorize ] && arg=
+ [ "$arg" = -Wshadow ] && arg=
+ if [ $compiler = ICL ]; then
+ [ "$arg" = -Wall ] && arg=-W0
+ [ "$arg" = -g ] && arg=-Z7
+ [ "$arg" = -fomit-frame-pointer ] && arg=
+ [ "$arg" = -s ] && arg=
+ [ "$arg" = -fPIC ] && arg=
+ else
+ [ "$arg" = -Wall ] && arg=-w0
+ fi
+
+ [ -n "$arg" ] && echo -n "$arg "
+ done
+}
+
+icl_ldflags() {
+ for arg in $*; do
+ arg=${arg/LIBPATH/libpath}
+ [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib
+ [ ${arg#-L} != $arg ] && arg=-libpath:${arg#-L}
+ [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware
+ [ $arg = -s ] && arg=
+ [ "$arg" = -Wl,-Bsymbolic ] && arg=
+
+ arg=${arg/pthreadGC/pthreadVC}
+ [ "$arg" = avifil32.lib ] && arg=vfw32.lib
+ [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib
+
+ [ -n "$arg" ] && echo -n "$arg "
+ done
+}
+
cc_check() {
if [ -z "$3" ]; then
if [ -z "$1$2" ]; then
@@ -59,14 +103,23 @@
log_check "for $1"
fi
elif [ -z "$1" ]; then
- log_check "whether $CC supports $3"
+ if [ -z "$2" ]; then
+ log_check "whether $CC supports $3"
+ else
+ log_check "whether $CC supports $3 with $2"
+ fi
else
log_check "for $3 in $1";
fi
rm -f conftest.c
[ -n "$1" ] && echo "#include <$1>" > conftest.c
echo "int main () { $3 return 0; }" >> conftest.c
- if $CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest >conftest.log 2>&1; then
+ if [ $compiler = ICL ]; then
+ cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
+ else
+ cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
+ fi
+ if $cc_cmd >conftest.log 2>&1; then
res=$?
log_ok
else
@@ -74,7 +127,7 @@
log_fail
log_msg "Failed commandline was:"
log_msg "--------------------------------------------------"
- log_msg "$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS"
+ log_msg "$cc_cmd"
cat conftest.log >> config.log
log_msg "--------------------------------------------------"
log_msg "Failed program was:"
@@ -140,7 +193,7 @@
exit 1
}
-rm -f x264_config.h config.h config.mak config.log x264.pc conftest*
+rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest*
prefix='/usr/local'
exec_prefix='${prefix}'
@@ -149,6 +202,10 @@
includedir='${prefix}/include'
DEVNULL='/dev/null'
+cli="yes"
+cli_libx264="internal"
+shared="no"
+static="no"
avs="auto"
lavf="auto"
ffms="auto"
@@ -157,12 +214,14 @@
thread="auto"
swscale="auto"
asm="auto"
+interlaced="yes"
debug="no"
gprof="no"
+strip="no"
pic="no"
vis="no"
-shared="no"
bit_depth="8"
+compiler="GNU"
CFLAGS="$CFLAGS -Wall -I."
LDFLAGS="$LDFLAGS"
@@ -174,7 +233,7 @@
EXE=""
# list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL"
+CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED"
# parse options
@@ -196,9 +255,24 @@
--includedir=*)
includedir="$optarg"
;;
+ --disable-cli)
+ cli="no"
+ ;;
+ --system-libx264)
+ cli_libx264="system"
+ ;;
+ --enable-shared)
+ shared="yes"
+ ;;
+ --enable-static)
+ static="yes"
+ ;;
--disable-asm)
asm="no"
;;
+ --disable-interlaced)
+ interlaced="no"
+ ;;
--disable-avs)
avs="no"
;;
@@ -240,12 +314,12 @@
LDFLAGS="$LDFLAGS -pg"
gprof="yes"
;;
+ --enable-strip)
+ strip="yes"
+ ;;
--enable-pic)
pic="yes"
;;
- --enable-shared)
- shared="yes"
- ;;
--enable-visualize)
vis="yes"
;;
@@ -273,6 +347,8 @@
esac
done
+[ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static."
+
CC="${CC-${cross_prefix}gcc}"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
@@ -290,6 +366,26 @@
host_vendor="${host%%-*}"
host_os="${host#*-}"
+# test for use of Intel Compiler
+if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
+ if [[ `basename "$CC"` = icl* ]]; then
+ # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths.
+ [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS"
+ compiler=ICL
+ CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -Iextras"
+ QPRE="-Q"
+ `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
+ `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
+ cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
+ fi
+else
+ if [[ `basename "$CC"` = icc* ]]; then
+ AR="xiar"
+ compiler=ICC
+ QPRE="-"
+ fi
+fi
+
case $host_os in
beos*)
SYS="BEOS"
@@ -326,16 +422,21 @@
LDFLAGS="$LDFLAGS -lm"
;;
cygwin*)
- SYS="MINGW"
EXE=".exe"
- DEVNULL="NUL"
if cc_check "" -mno-cygwin; then
CFLAGS="$CFLAGS -mno-cygwin"
LDFLAGS="$LDFLAGS -mno-cygwin"
fi
+ if cpp_check "" "" "defined(__CYGWIN32__)" ; then
+ define HAVE_MALLOC_H
+ SYS="CYGWIN"
+ else
+ SYS="WINDOWS"
+ DEVNULL="NUL"
+ fi
;;
mingw*)
- SYS="MINGW"
+ SYS="WINDOWS"
EXE=".exe"
DEVNULL="NUL"
;;
@@ -355,15 +456,31 @@
ARCH="X86"
AS="yasm"
ASFLAGS="$ASFLAGS -O2"
- if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then
- CFLAGS="$CFLAGS -march=i686"
- fi
- if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then
- CFLAGS="$CFLAGS -mfpmath=sse -msse"
+ if [ $compiler = GNU ]; then
+ if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then
+ CFLAGS="$CFLAGS -march=i686"
+ fi
+ if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then
+ CFLAGS="$CFLAGS -mfpmath=sse -msse"
+ fi
+ else
+ # icc on linux has various degrees of mod16 stack support
+ if [ $SYS = LINUX ]; then
+ # < 11 is completely incapable of keeping a mod16 stack
+ if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then
+ define BROKEN_STACK_ALIGNMENT
+ # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
+ elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then
+ CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
+ fi
+ # >= 12 defaults to a mod16 stack
+ fi
+ # icl on windows has no mod16 stack support
+ [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT
fi
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho -DPREFIX"
- elif [ "$SYS" = MINGW ]; then
+ elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then
ASFLAGS="$ASFLAGS -f win32 -DPREFIX"
LDFLAGS="$LDFLAGS -Wl,--large-address-aware"
else
@@ -379,9 +496,10 @@
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
fi
- elif [ "$SYS" = MINGW ]; then
+ elif [ "$SYS" = WINDOWS ]; then
ASFLAGS="$ASFLAGS -f win32 -m amd64"
- cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX"
+ # only the GNU toolchain is inconsistent in prefixing function names with _
+ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX"
else
ASFLAGS="$ASFLAGS -f elf -m amd64"
fi
@@ -460,10 +578,14 @@
cc_check || die "No working C compiler found."
-if cc_check '' -std=gnu99 ; then
- CFLAGS="$CFLAGS -std=gnu99"
-elif cc_check '' -std=c99 ; then
- CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
+if [ $compiler != ICL ]; then
+ if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then
+ CFLAGS="$CFLAGS -std=gnu99"
+ elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then
+ CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
+ elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then
+ die "C99 compiler is needed for compilation."
+ fi
fi
if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" \) ] ; then
@@ -472,14 +594,14 @@
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
if ! as_check "vpaddw xmm0, xmm0, xmm0" ; then
- VER=`($AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
+ VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
echo "Minimum version is yasm-0.7.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
if ! cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' ; then
- VER=`(${cross_prefix}as --version || echo no gnu as) 2>$DEVNULL | head -n 1`
+ VER=`(${cross_prefix}as --version || echo no gnu as) 2>/dev/null | head -n 1`
echo "Found $VER"
echo "Minimum version is binutils-2.17"
echo "Your compiler can't handle inline SSSE3 asm."
@@ -510,18 +632,21 @@
define ARCH_$ARCH
define SYS_$SYS
-echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
-$CC $CFLAGS conftest.c -c -o conftest.o 2>$DEVNULL || die "endian test failed"
-if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
- define WORDS_BIGENDIAN
-elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
- die "endian test failed"
+# skip endianness check for Intel Compiler, as all supported platforms are little. the -ipo flag will also cause the check to fail
+if [ $compiler = GNU ]; then
+ echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
+ $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed"
+ if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
+ define WORDS_BIGENDIAN
+ elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
+ die "endian test failed"
+ fi
fi
# autodetect options that weren't forced nor disabled
# pthread-win32 is lgpl, prevent its use if --disable-gpl is specified and targeting windows
-[ "$SYS" = "MINGW" -a "$gpl" = "no" -a "$thread" = "auto" ] && thread="win32"
+[ "$SYS" = "WINDOWS" -a "$gpl" = "no" -a "$thread" = "auto" ] && thread="win32"
libpthread=""
if [ "$thread" = "auto" ]; then
@@ -531,7 +656,7 @@
thread="beos"
define HAVE_BEOSTHREAD
;;
- MINGW)
+ WINDOWS)
if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then
thread="posix"
libpthread="-lpthread"
@@ -564,7 +689,8 @@
define HAVE_POSIXTHREAD
fi
if [ "$thread" = "win32" ]; then
- if [ "$SYS" = "MINGW" ]; then
+ # cygwin does not support win32 threads
+ if [ "$SYS" = "WINDOWS" ]; then
define HAVE_WIN32THREAD
else
thread="no"
@@ -590,30 +716,24 @@
if [ "$swscale" = "auto" ] ; then
swscale="no"
- if ${cross_prefix}pkg-config --exists libswscale 2>$DEVNULL; then
+ if ${cross_prefix}pkg-config --exists libswscale 2>/dev/null; then
SWSCALE_LIBS="$SWSCALE_LIBS $(${cross_prefix}pkg-config --libs libswscale)"
SWSCALE_CFLAGS="$SWSCALE_CFLAGS $(${cross_prefix}pkg-config --cflags libswscale)"
fi
[ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil"
- error="swscale must be at least version 0.9.0"
- if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_getContext(0,0,0,0,0,0,0,0,0,0);" ; then
- if cpp_check "libswscale/swscale.h" "$SWSCALE_CFLAGS" "LIBSWSCALE_VERSION_INT >= AV_VERSION_INT(0,9,0)" "$error"; then
- # we use colorspaces that were defined in libavutil r19775
- if cc_check "libavutil/pixfmt.h" "$SWSCALE_CFLAGS" "enum PixelFormat pixfmt = PIX_FMT_YUV422P16LE;" ; then
- swscale="yes"
- else
- echo "Warning: libavutil is too old, update to ffmpeg r19775+"
- fi
+ if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then
+ if cc_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "av_get_pix_fmt_name(0);" ; then
+ swscale="yes"
else
- echo "Warning: ${error}"
+ echo "Warning: av_get_pix_fmt_name is missing from libavutil, update for swscale support"
fi
fi
fi
if [ "$lavf" = "auto" ] ; then
lavf="no"
- if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>$DEVNULL; then
+ if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>/dev/null; then
LAVF_LIBS="$LAVF_LIBS $(${cross_prefix}pkg-config --libs libavformat libavcodec libavutil libswscale)"
LAVF_CFLAGS="$LAVF_CFLAGS $(${cross_prefix}pkg-config --cflags libavformat libavcodec libavutil libswscale)"
fi
@@ -625,15 +745,14 @@
fi
LAVF_LIBS="-L. $LAVF_LIBS"
if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avcodec_decode_video2(0,0,0,0);" ; then
- # libvautil/pixdesc.h included the private header intreadwrite.h until r21854
- if cc_check libavutil/pixdesc.h "$LAVF_CFLAGS $LAVF_LIBS" ; then
+ if cpp_check libavcodec/avcodec.h "$LAVF_CFLAGS $LAVF_LIBS" "LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(52,64,0)" ; then
if [ "$swscale" = "yes" ]; then
lavf="yes"
else
echo "Warning: libavformat is not supported without swscale support"
fi
else
- echo "Warning: libavutil is too old, update to ffmpeg r21854+"
+ echo "Warning: libavcodec is too old, update to ffmpeg r22735+"
fi
fi
fi
@@ -642,7 +761,7 @@
ffms_major="2"; ffms_minor="14"; ffms_micro="0"; ffms_bump="0"
ffms="no"
- if ${cross_prefix}pkg-config --exists ffms2 2>$DEVNULL; then
+ if ${cross_prefix}pkg-config --exists ffms2 2>/dev/null; then
FFMS2_LIBS="$FFMS2_LIBS $(${cross_prefix}pkg-config --libs ffms2)"
FFMS2_CFLAGS="$FFMS2_CFLAGS $(${cross_prefix}pkg-config --cflags ffms2)"
fi
@@ -682,12 +801,12 @@
fi
fi
-GPAC_LIBS="-lgpac_static"
-if [ $SYS = MINGW ]; then
- GPAC_LIBS="$GPAC_LIBS -lwinmm"
-fi
if [ "$gpac" = "auto" ] ; then
gpac="no"
+ cc_check "" -lz && GPAC_LIBS="-lgpac_static -lz" || GPAC_LIBS="-lgpac_static"
+ if [ "$SYS" = "WINDOWS" ] ; then
+ GPAC_LIBS="$GPAC_LIBS -lwinmm"
+ fi
if cc_check gpac/isomedia.h "$GPAC_LIBS" ; then
if cc_check gpac/isomedia.h "$GPAC_LIBS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then
gpac="yes"
@@ -706,12 +825,15 @@
if [ "$avs" = "auto" ] ; then
avs="no"
- if [ $SYS = MINGW ] && cc_check extras/avisynth_c.h ; then
+ # cygwin can use avisynth if it can use LoadLibrary
+ if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then
avs="yes"
define HAVE_AVS
fi
fi
+cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT
+
if [ "$pic" = "yes" ] ; then
CFLAGS="$CFLAGS -fPIC"
ASFLAGS="$ASFLAGS -DPIC"
@@ -720,7 +842,11 @@
fi
if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then
- CFLAGS="$CFLAGS -s -fomit-frame-pointer"
+ CFLAGS="$CFLAGS -fomit-frame-pointer"
+fi
+
+if [ "$strip" = "yes" ]; then
+ CFLAGS="$CFLAGS -s"
LDFLAGS="$LDFLAGS -s"
fi
@@ -738,7 +864,7 @@
CFLAGS="$CFLAGS -fno-tree-vectorize"
fi
-if [ $SYS = MINGW -a $ARCH = X86 ] ; then
+if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then
# workaround gcc/ld bug with alignment of static variables/arrays that are initialized to zero
cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss"
fi
@@ -749,6 +875,9 @@
elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then
define fseek fseeko64
define ftell ftello64
+elif cc_check "stdio.h" "" "_fseeki64(stdin,0,0);" ; then
+ define fseek _fseeki64
+ define ftell _ftelli64
fi
if cc_check '' -Wshadow ; then
@@ -764,18 +893,60 @@
[ $gpl = yes ] && define HAVE_GPL && x264_gpl=1 || x264_gpl=0
+[ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0
+
#define undefined vars as 0
for var in $CONFIG_HAVE; do
grep -q "HAVE_$var 1" config.h || define HAVE_$var 0
done
+if [ $compiler = ICL ]; then
+ AR="xilib -nologo -out:"
+ DEPMM=-QMM
+ DEPMT=-QMT
+ HAVE_GETOPT_LONG=0
+ LD="xilink -out:"
+ LDFLAGS="-nologo -incremental:no $(icl_ldflags $LDFLAGS)"
+ LDFLAGSCLI="$(icl_ldflags $LDFLAGSCLI)"
+ LIBX264=libx264.lib
+ RANLIB=
+ STRIP=
+ if [ $debug = yes ]; then
+ LDFLAGS="-debug $LDFLAGS"
+ CFLAGS="-D_DEBUG $CFLAGS"
+ else
+ CFLAGS="-DNDEBUG $CFLAGS"
+ fi
+else
+ AR="$AR rc "
+ DEPMM="-MM -g0"
+ DEPMT="-MT"
+ LD="$CC -o "
+ LIBX264=libx264.a
+fi
+if [ $compiler = GNU ]; then
+ PROF_GEN_CC="-fprofile-generate"
+ PROF_GEN_LD="-fprofile-generate"
+ PROF_USE_CC="-fprofile-use"
+ PROF_USE_LD="-fprofile-use"
+else
+ CFLAGS="$(intel_cflags $CFLAGS)"
+ # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP
+ [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__
+ PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir."
+ PROF_GEN_LD=
+ PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir."
+ PROF_USE_LD=
+fi
+
rm -f conftest*
# generate exported config file
cat > x264_config.h << EOF
-#define X264_BIT_DEPTH $bit_depth
-#define X264_GPL $x264_gpl
+#define X264_BIT_DEPTH $bit_depth
+#define X264_GPL $x264_gpl
+#define X264_INTERLACED $x264_interlaced
EOF
# generate config files
@@ -790,8 +961,11 @@
SYS=$SYS
CC=$CC
CFLAGS=$CFLAGS
+DEPMM=$DEPMM
+DEPMT=$DEPMT
+LD=$LD
LDFLAGS=$LDFLAGS
-LDFLAGSCLI=$LDFLAGSCLI
+LIBX264=$LIBX264
AR=$AR
RANLIB=$RANLIB
STRIP=$STRIP
@@ -800,30 +974,74 @@
EXE=$EXE
HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG
DEVNULL=$DEVNULL
+PROF_GEN_CC=$PROF_GEN_CC
+PROF_GEN_LD=$PROF_GEN_LD
+PROF_USE_CC=$PROF_USE_CC
+PROF_USE_LD=$PROF_USE_LD
EOF
+if [ $compiler = ICL ]; then
+ echo '%.o: %.c' >> config.mak
+ echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
+fi
+
+if [ "$cli" = "yes" ]; then
+ echo 'default: cli' >> config.mak
+ echo 'install: install-cli' >> config.mak
+fi
+
if [ "$shared" = "yes" ]; then
API=$(grep '#define X264_BUILD' < x264.h | cut -f 3 -d ' ')
- if [ "$SYS" = "MINGW" ]; then
+ if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then
echo "SONAME=libx264-$API.dll" >> config.mak
- echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
- echo 'SOFLAGS=-Wl,--out-implib,$(IMPLIBNAME) -Wl,--enable-auto-image-base' >> config.mak
+ if [ $compiler = ICL ]; then
+ echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak
+ # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations
+ # MSVC link does not act similarly, so it is required to make an export definition out of x264.h and use it at link time
+ echo 'SOFLAGS=-dll -def:x264.def -implib:$(IMPLIBNAME)' >> config.mak
+ echo "EXPORTS" > x264.def
+ grep "^\(int\|void\|x264_t\|extern\).*x264.*[\[(;]" x264.h | sed -e "s/.*\(x264.*\)[\[(].*/\1/;s/.*\(x264.*\);/\1/;s/open/open_$API/g" >> x264.def
+ else
+ echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
+ echo 'SOFLAGS=-shared -Wl,--out-implib,$(IMPLIBNAME) -Wl,--enable-auto-image-base' >> config.mak
+ fi
elif [ "$SYS" = "MACOSX" ]; then
echo "SOSUFFIX=dylib" >> config.mak
echo "SONAME=libx264.$API.dylib" >> config.mak
- echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAME)' >> config.mak
+ echo 'SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAME)' >> config.mak
elif [ "$SYS" = "SunOS" ]; then
echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
- echo 'SOFLAGS=-Wl,-h,$(SONAME)' >> config.mak
+ echo 'SOFLAGS=-shared -Wl,-h,$(SONAME)' >> config.mak
else
echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
- echo 'SOFLAGS=-Wl,-soname,$(SONAME)' >> config.mak
+ echo 'SOFLAGS=-shared -Wl,-soname,$(SONAME)' >> config.mak
fi
- echo 'default: $(SONAME)' >> config.mak
+ echo 'default: lib-shared' >> config.mak
+ echo 'install: install-lib-shared' >> config.mak
fi
+if [ "$static" = "yes" ]; then
+ echo 'default: lib-static' >> config.mak
+ echo 'install: install-lib-static' >> config.mak
+fi
+
+if [ "$cli_libx264" = "system" ] ; then
+ if [ "$shared" = "yes" ]; then
+ CLI_LIBX264='$(SONAME)'
+ elif ${cross_prefix}pkg-config --exists x264 2>/dev/null; then
+ LDFLAGSCLI="$LDFLAGSCLI $(${cross_prefix}pkg-config --libs x264)"
+ CLI_LIBX264=
+ else
+ die "Can not find system libx264"
+ fi
+else
+ CLI_LIBX264='$(LIBX264)'
+fi
+echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak
+echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak
+
./version.sh >> config.h
pclibs="-L$libdir -lx264 $libpthread"
@@ -849,7 +1067,12 @@
cat > conftest.log <<EOF
Platform: $ARCH
System: $SYS
+cli: $cli
+libx264: $cli_libx264
+shared: $shared
+static: $static
asm: $asm
+interlaced: $interlaced
avs: $avs
lavf: $lavf
ffms: $ffms
@@ -859,8 +1082,8 @@
filters: $filters
debug: $debug
gprof: $gprof
+strip: $strip
PIC: $pic
-shared: $shared
visualize: $vis
bit depth: $bit_depth
EOF
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/analyse.c
^
|
@@ -26,8 +26,6 @@
*****************************************************************************/
#define _ISOC99_SOURCE
-#include <math.h>
-#include <unistd.h>
#include "common/common.h"
#include "macroblock.h"
@@ -138,7 +136,8 @@
} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
-const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = {
+const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
+{
1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
@@ -154,7 +153,8 @@
/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Capped to avoid overflow */
-const int x264_lambda2_tab[QP_MAX_MAX+1] = {
+const int x264_lambda2_tab[QP_MAX_MAX+1] =
+{
14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
@@ -168,14 +168,16 @@
134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
};
-const uint8_t x264_exp2_lut[64] = {
+const uint8_t x264_exp2_lut[64] =
+{
0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
-const float x264_log2_lut[128] = {
+const float x264_log2_lut[128] =
+{
0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
@@ -195,13 +197,15 @@
};
/* Avoid an int/float conversion. */
-const float x264_log2_lz_lut[32] = {
+const float x264_log2_lz_lut[32] =
+{
31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
-static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = {
+static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
+{
// inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
{
46, 58, 73, 92, 117, 147,
@@ -238,7 +242,8 @@
};
#define MAX_CHROMA_LAMBDA_OFFSET 36
-static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] = {
+static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
+{
16, 20, 25, 32, 40, 50,
64, 80, 101, 128, 161, 203,
256, 322, 406, 512, 645, 812,
@@ -249,16 +254,20 @@
};
/* TODO: calculate CABAC costs */
-static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
+static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
+{
9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
-static const uint8_t i_mb_b16x8_cost_table[17] = {
+static const uint8_t i_mb_b16x8_cost_table[17] =
+{
0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
-static const uint8_t i_sub_mb_b_cost_table[13] = {
+static const uint8_t i_sub_mb_b_cost_table[13] =
+{
7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
-static const uint8_t i_sub_mb_p_cost_table[4] = {
+static const uint8_t i_sub_mb_p_cost_table[4] =
+{
5, 3, 3, 1
};
@@ -267,7 +276,18 @@
static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-int x264_analyse_init_costs( x264_t *h, int qp )
+float *x264_analyse_prepare_costs( x264_t *h )
+{
+ float *logs = x264_malloc( (2*4*2048+1)*sizeof(float) );
+ if( !logs )
+ return NULL;
+ logs[0] = 0.718f;
+ for( int i = 1; i <= 2*4*2048; i++ )
+ logs[i] = log2f(i+1)*2 + 1.718f;
+ return logs;
+}
+
+int x264_analyse_init_costs( x264_t *h, float *logs, int qp )
{
int lambda = x264_lambda_tab[qp];
if( h->cost_mv[qp] )
@@ -278,7 +298,7 @@
for( int i = 0; i <= 2*4*2048; i++ )
{
h->cost_mv[qp][-i] =
- h->cost_mv[qp][i] = X264_MIN( lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f, (1<<16)-1 );
+ h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( int i = 0; i < 3; i++ )
@@ -320,7 +340,7 @@
{
x264_frame_t *frame = h->fref[0][j];
int width = frame->i_width[0] + 2*PADH;
- int i_padv = PADV << h->param.b_interlaced;
+ int i_padv = PADV << PARAM_INTERLACED;
int offset, height;
pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
@@ -428,7 +448,7 @@
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
- h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
+ h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
@@ -441,15 +461,14 @@
}
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
- if( h->mb.i_mb_x == 0 )
+ if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
{
- int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
- int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
+ int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
int thread_mvy_range = i_fmv_range;
if( h->i_thread_frames > 1 )
{
- int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
+ int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread;
for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
for( int j = 0; j < h->i_ref[i]; j++ )
@@ -460,19 +479,48 @@
if( h->param.b_deterministic )
thread_mvy_range = h->param.analyse.i_mv_range_thread;
- if( h->mb.b_interlaced )
+ if( PARAM_INTERLACED )
thread_mvy_range >>= 1;
x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
}
- h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
- h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
- h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
- h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
- h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
- h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
- h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
+ if( PARAM_INTERLACED )
+ {
+ /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
+ for( int i = 0; i < 3; i++ )
+ {
+ int j = i == 2;
+ mb_y = (h->mb.i_mb_y >> j) + (i == 1);
+ h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
+ h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
+ h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
+ h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
+ h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
+ h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
+ h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
+ }
+ }
+ else
+ {
+ h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
+ h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
+ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
+ h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
+ h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
+ h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
+ h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
+ }
+ }
+ if( PARAM_INTERLACED )
+ {
+ int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
+ h->mb.mv_min[1] = h->mb.mv_miny_row[i];
+ h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
+ h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
+ h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
+ h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i];
+ h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i];
}
#undef CLIP_FMV
@@ -516,7 +564,7 @@
{
/* Always run in fast-intra mode for subme < 3 */
if( h->mb.i_subpel_refine > 2 &&
- ( IS_INTRA( h->mb.i_mb_type_left ) ||
+ ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
IS_INTRA( h->mb.i_mb_type_top ) ||
IS_INTRA( h->mb.i_mb_type_topleft ) ||
IS_INTRA( h->mb.i_mb_type_topright ) ||
@@ -1296,7 +1344,7 @@
/* early termination: if 16x16 chose ref 0, then evalute no refs older
* than those used by the neighbors */
if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
- h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
+ h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
{
i_maxref = 0;
CHECK_NEIGHBOUR( -8 - 1 );
@@ -1565,7 +1613,7 @@
const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
- const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
x264_weight_t *weight = h->sh.weight[i_ref];
// FIXME weight can be done on 4x4 blocks even if mc is smaller
@@ -1722,8 +1770,8 @@
#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
- l0_mvy_offset = h->mb.b_interlaced & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
- l1_mvy_offset = h->mb.b_interlaced & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+ l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+ l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
h->mc.mc_chroma( pix[0], pix[1], 8, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
h->mc.mc_chroma( pix[2], pix[3], 8, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
h->mc.avg[i_pixel+3]( bi[0], 8, pix[0], 8, pix[2], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
@@ -1907,18 +1955,18 @@
ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] );
ALIGNED_ARRAY_16( pixel, bi, [8*FENC_STRIDE] );
- if( h->mb.b_interlaced & a->l0.bi16x16.i_ref )
+ if( MB_INTERLACED & a->l0.bi16x16.i_ref )
{
- int l0_mvy_offset = h->mb.b_interlaced & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
}
else
h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
- if( h->mb.b_interlaced & a->l1.bi16x16.i_ref )
+ if( MB_INTERLACED & a->l1.bi16x16.i_ref )
{
- int l1_mvy_offset = h->mb.b_interlaced & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
}
@@ -2063,7 +2111,7 @@
{
x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
- h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
+ h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
{
i_maxref[l] = 0;
CHECK_NEIGHBOUR( -8 - 1 );
@@ -2809,15 +2857,19 @@
}
else
{
+ int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
+ /* If the current macroblock is off the frame, just skip it. */
+ if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
+ b_skip = 1;
/* Fast P_SKIP detection */
- if( h->param.analyse.b_fast_pskip )
+ else if( h->param.analyse.b_fast_pskip )
{
- if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+ if( skip_invalid )
// FIXME don't need to check this if the reference frame is done
{}
else if( h->param.analyse.i_subpel_refine >= 3 )
analysis.b_try_skip = 1;
- else if( h->mb.i_mb_type_left == P_SKIP ||
+ else if( h->mb.i_mb_type_left[0] == P_SKIP ||
h->mb.i_mb_type_top == P_SKIP ||
h->mb.i_mb_type_topleft == P_SKIP ||
h->mb.i_mb_type_topright == P_SKIP )
@@ -3139,7 +3191,10 @@
{
if( !h->mb.b_direct_auto_write )
x264_mb_mc( h );
- if( analysis.i_mbrd )
+ /* If the current macroblock is off the frame, just skip it. */
+ if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
+ b_skip = 1;
+ else if( analysis.i_mbrd )
{
i_bskip_cost = ssd_mb( h );
/* 6 = minimum cavlc cost of a non-skipped MB */
@@ -3657,8 +3712,8 @@
int ref = h->mb.cache.ref[l][x264_scan8[0]];
if( ref < 0 )
continue;
- completed = h->fref[l][ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
- if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
+ completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
+ if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
{
x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/analyse.h
^
|
@@ -27,7 +27,8 @@
#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H
-int x264_analyse_init_costs( x264_t *h, int qp );
+float *x264_analyse_prepare_costs( x264_t *h );
+int x264_analyse_init_costs( x264_t *h, float *logs, int qp );
void x264_analyse_free_costs( x264_t *h );
void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/cabac.c
^
|
@@ -66,20 +66,36 @@
}
}
+#if !RDO_SKIP_BS
+static void x264_cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb )
+{
+ int ctx = 0;
+ ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x;
+ ctx += (h->mb.i_mb_top_mbpair_xy >= 0
+ && h->mb.slice_table[h->mb.i_mb_top_mbpair_xy] == h->sh.i_first_mb
+ && h->mb.field[h->mb.i_mb_top_mbpair_xy]);
+
+ x264_cabac_encode_decision_noup( cb, 70 + ctx, MB_INTERLACED );
+ h->mb.field_decoding_flag = MB_INTERLACED;
+}
+#endif
+
static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
{
const int i_mb_type = h->mb.i_type;
- if( h->sh.b_mbaff &&
+#if !RDO_SKIP_BS
+ if( SLICE_MBAFF &&
(!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
{
- x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
+ x264_cabac_field_decoding_flag( h, cb );
}
+#endif
if( h->sh.i_type == SLICE_TYPE_I )
{
int ctx = 0;
- if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != I_4x4 )
+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 )
ctx++;
if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 )
ctx++;
@@ -113,7 +129,7 @@
else //if( h->sh.i_type == SLICE_TYPE_B )
{
int ctx = 0;
- if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT )
ctx++;
if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
ctx++;
@@ -198,7 +214,7 @@
int ctx = 0;
/* No need to test for I4x4 or I_16x16 as cache_save handle that */
- if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy] != 0 )
+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
ctx++;
if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 )
ctx++;
@@ -280,9 +296,9 @@
#if !RDO_SKIP_BS
void x264_cabac_mb_skip( x264_t *h, int b_skip )
{
- int ctx = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left ))
- + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top ))
- + (h->sh.i_type == SLICE_TYPE_P ? 11 : 24);
+ int ctx = h->mb.cache.i_neighbour_skip + 11;
+ if( h->sh.i_type != SLICE_TYPE_P )
+ ctx += 13;
x264_cabac_encode_decision( &h->cabac, ctx, b_skip );
}
#endif
@@ -335,7 +351,7 @@
const int i8 = x264_scan8[idx];
const int i_refa = h->mb.cache.ref[i_list][i8 - 1];
const int i_refb = h->mb.cache.ref[i_list][i8 - 8];
- int ctx = 0;
+ int ctx = 0;
if( i_refa > 0 && !h->mb.cache.skip[i8 - 1] )
ctx++;
@@ -365,7 +381,7 @@
for( int i = 1; i < i_abs; i++ )
x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 );
x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 );
- x264_cabac_encode_bypass( cb, mvd < 0 );
+ x264_cabac_encode_bypass( cb, mvd >> 31 );
}
else
{
@@ -405,12 +421,12 @@
x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
}
- x264_cabac_encode_bypass( cb, mvd < 0 );
+ x264_cabac_encode_bypass( cb, mvd >> 31 );
}
#endif
- /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+ /* Since we don't need to keep track of MVDs larger than 66, just cap the value.
* This lets us store MVDs as 8-bit values instead of 16-bit. */
- return X264_MIN( i_abs, 33 );
+ return X264_MIN( i_abs, 66 );
}
static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
@@ -510,11 +526,13 @@
}
-static const uint16_t significant_coeff_flag_offset[2][6] = {
+static const uint16_t significant_coeff_flag_offset[2][6] =
+{
{ 105, 120, 134, 149, 152, 402 },
{ 277, 292, 306, 321, 324, 436 }
};
-static const uint16_t last_coeff_flag_offset[2][6] = {
+static const uint16_t last_coeff_flag_offset[2][6] =
+{
{ 166, 181, 195, 210, 213, 417 },
{ 338, 353, 367, 382, 385, 451 }
};
@@ -532,7 +550,8 @@
9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14
}};
-static const uint8_t last_coeff_flag_offset_8x8[63] = {
+static const uint8_t last_coeff_flag_offset_8x8[63] =
+{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -556,78 +575,70 @@
#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
- const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][ctx_block_cat];
- const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][ctx_block_cat];
- const int i_ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
- const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
- int i_coeff_abs_m1[64];
- int i_coeff_sign[64];
- int i_coeff = 0;
- int i_last;
- int node_ctx = 0;
- int i = 0;
+ const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
+ int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
+ int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
+ int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
+ int coeff_idx = -1, node_ctx = 0, last;
+ int coeffs[64];
- i_last = h->quantf.coeff_last[ctx_block_cat](l);
+ last = h->quantf.coeff_last[ctx_block_cat]( l );
#define WRITE_SIGMAP( l8x8 )\
- while(1)\
+ int i = 0;\
+ while( 1 )\
{\
if( l[i] )\
{\
- i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\
- i_coeff_sign[i_coeff] = l[i] < 0;\
- i_coeff++;\
- x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\
- if( i == i_last )\
+ coeffs[++coeff_idx] = l[i];\
+ x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\
+ if( i == last )\
{\
- x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\
+ x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\
break;\
}\
else\
- x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\
+ x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\
}\
else\
- x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? sig_offset[i] : i), 0 );\
+ x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 0 );\
i++;\
- if( i == i_count_m1 )\
+ if( i == count_m1 )\
{\
- i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\
- i_coeff_sign[i_coeff] = l[i] < 0;\
- i_coeff++;\
+ coeffs[++coeff_idx] = l[i];\
break;\
}\
}
if( ctx_block_cat == DCT_LUMA_8x8 )
{
- const int i_count_m1 = 63;
+ int count_m1 = 63;
WRITE_SIGMAP( 1 )
}
else
{
- const int i_count_m1 = count_cat_m1[ctx_block_cat];
+ int count_m1 = count_cat_m1[ctx_block_cat];
WRITE_SIGMAP( 0 )
}
do
{
- int i_prefix, ctx;
- i_coeff--;
-
/* write coeff_abs - 1 */
- i_prefix = X264_MIN( i_coeff_abs_m1[i_coeff], 14 );
- ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+ int coeff = coeffs[coeff_idx];
+ int abs_coeff = abs(coeff);
+ int coeff_sign = coeff >> 31;
+ int ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
- if( i_prefix )
+ if( abs_coeff > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
- for( i = 0; i < i_prefix - 1; i++ )
+ ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+ for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
x264_cabac_encode_decision( cb, ctx, 1 );
- if( i_prefix < 14 )
+ if( abs_coeff < 15 )
x264_cabac_encode_decision( cb, ctx, 0 );
else
- x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
+ x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 );
node_ctx = coeff_abs_level_transition[1][node_ctx];
}
@@ -637,8 +648,8 @@
node_ctx = coeff_abs_level_transition[0][node_ctx];
}
- x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
- } while( i_coeff > 0 );
+ x264_cabac_encode_bypass( cb, coeff_sign );
+ } while( --coeff_idx >= 0 );
}
#define block_residual_write_cabac_8x8( h, cb, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, l )
@@ -650,37 +661,35 @@
* for this (~0.001db) and the speed boost (~30%) is worth it. */
static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8 )
{
- const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][ctx_block_cat];
- const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][ctx_block_cat];
- const int i_ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
- const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
- int i_last, i_coeff_abs, ctx, node_ctx;
-
- i_last = h->quantf.coeff_last[ctx_block_cat](l);
-
- i_coeff_abs = abs(l[i_last]);
- ctx = coeff_abs_level1_ctx[0] + i_ctx_level;
+ const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
+ int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
+ int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
+ int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
+ int last = h->quantf.coeff_last[ctx_block_cat]( l );
+ int coeff_abs = abs(l[last]);
+ int ctx = coeff_abs_level1_ctx[0] + ctx_level;
+ int node_ctx;
- if( i_last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) )
+ if( last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) )
{
- x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?sig_offset[i_last]:i_last), 1 );
- x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 );
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : last), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : last), 1 );
}
- if( i_coeff_abs > 1 )
+ if( coeff_abs > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level;
- if( i_coeff_abs < 15 )
+ ctx = coeff_abs_levelgt1_ctx[0] + ctx_level;
+ if( coeff_abs < 15 )
{
- cb->f8_bits_encoded += cabac_size_unary[i_coeff_abs-1][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[i_coeff_abs-1][cb->state[ctx]];
+ cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
- x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs - 15 );
+ x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][0];
}
@@ -691,29 +700,29 @@
x264_cabac_encode_bypass( cb, 0 ); // sign
}
- for( int i = i_last-1 ; i >= 0; i-- )
+ for( int i = last-1 ; i >= 0; i-- )
{
if( l[i] )
{
- i_coeff_abs = abs(l[i]);
- x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?sig_offset[i]:i), 1 );
- x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 );
- ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+ coeff_abs = abs(l[i]);
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );
+ ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
- if( i_coeff_abs > 1 )
+ if( coeff_abs > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
- if( i_coeff_abs < 15 )
+ ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+ if( coeff_abs < 15 )
{
- cb->f8_bits_encoded += cabac_size_unary[i_coeff_abs-1][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[i_coeff_abs-1][cb->state[ctx]];
+ cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
- x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs - 15 );
+ x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][node_ctx];
}
@@ -725,7 +734,7 @@
}
}
else
- x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?sig_offset[i]:i), 0 );
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 0 );
}
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/cavlc.c
^
|
@@ -100,9 +100,8 @@
/* Weight highly against overflows. */
s->i_bits_encoded += 2000;
#else
- x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
- /* clip level, preserving sign */
- i_level_code = (1<<12) - 2 + (i_level_code & 1);
+ /* We've had an overflow; note it down and re-encode the MB later. */
+ h->mb.b_overflow = 1;
#endif
}
}
@@ -296,10 +295,10 @@
int i_mb_pos_tex;
#endif
- if( h->sh.b_mbaff
+ if( SLICE_MBAFF
&& (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
{
- bs_write1( s, h->mb.b_interlaced );
+ bs_write1( s, MB_INTERLACED );
}
#if !RDO_SKIP_BS
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/encoder.c
^
|
@@ -25,8 +25,6 @@
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
-#include <math.h>
-
#include "common/common.h"
#include "set.h"
@@ -104,7 +102,7 @@
sh->i_frame_num = i_frame;
- sh->b_mbaff = h->param.b_interlaced;
+ sh->b_mbaff = PARAM_INTERLACED;
sh->b_field_pic = 0; /* no field support for now */
sh->b_bottom_field = 0; /* not yet used */
@@ -183,8 +181,10 @@
{
if( sh->b_mbaff )
{
- assert( sh->i_first_mb % (2*sh->sps->i_mb_width) == 0 );
- bs_write_ue( s, sh->i_first_mb >> 1 );
+ int first_x = sh->i_first_mb % sh->sps->i_mb_width;
+ int first_y = sh->i_first_mb / sh->sps->i_mb_width;
+ assert( (first_y&1) == 0 );
+ bs_write_ue( s, (2*first_x + sh->sps->i_mb_width*(first_y&~1) + (first_y&1)) >> 1 );
}
else
bs_write_ue( s, sh->i_first_mb );
@@ -335,8 +335,9 @@
static int x264_bitstream_check_buffer( x264_t *h )
{
uint8_t *bs_bak = h->out.p_bitstream;
- if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500)) ||
- (h->out.bs.p_end - h->out.bs.p < 2500) )
+ int max_mb_size = 2500 << SLICE_MBAFF;
+ if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_mb_size)) ||
+ (h->out.bs.p_end - h->out.bs.p < max_mb_size) )
{
h->out.i_bitstream += 100000;
CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream );
@@ -383,15 +384,15 @@
*
****************************************************************************/
-static int x264_validate_parameters( x264_t *h )
+static int x264_validate_parameters( x264_t *h, int b_open )
{
#if HAVE_MMX
#ifdef __SSE__
- if( !(x264_cpu_detect() & X264_CPU_SSE) )
+ if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
#else
- if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+ if( b_open && !(x264_cpu_detect() & X264_CPU_MMXEXT) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
#endif
@@ -419,6 +420,16 @@
return -1;
}
+#if HAVE_INTERLACED
+ h->param.b_interlaced = !!PARAM_INTERLACED;
+#else
+ if( h->param.b_interlaced )
+ {
+ x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
+ return -1;
+ }
+#endif
+
if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width ||
(h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height )
{
@@ -457,23 +468,10 @@
h->param.analyse.i_weighted_pred = 0;
}
- if( h->param.b_interlaced )
- {
- if( h->param.analyse.i_me_method >= X264_ME_ESA )
- {
- x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
- h->param.analyse.i_me_method = X264_ME_UMH;
- }
- if( h->param.analyse.i_weighted_pred > 0 )
- {
- x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
- h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
- }
- }
-
h->param.i_frame_packing = x264_clip3( h->param.i_frame_packing, -1, 5 );
/* Detect default ffmpeg settings and terminate with an error. */
+ if( b_open )
{
int score = 0;
score += h->param.analyse.i_me_range == 0;
@@ -502,7 +500,11 @@
return -1;
}
h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, -QP_BD_OFFSET, 51 );
+ h->param.rc.f_rf_constant_max = x264_clip3f( h->param.rc.f_rf_constant_max, -QP_BD_OFFSET, 51 );
h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
+ h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 10 );
+ h->param.rc.f_ip_factor = X264_MAX( h->param.rc.f_ip_factor, 0.01f );
+ h->param.rc.f_pb_factor = X264_MAX( h->param.rc.f_pb_factor, 0.01f );
if( h->param.rc.i_rc_method == X264_RC_CRF )
{
h->param.rc.i_qp_constant = h->param.rc.f_rf_constant + QP_BD_OFFSET;
@@ -538,9 +540,15 @@
h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX );
h->param.rc.i_aq_mode = 0;
h->param.rc.b_mb_tree = 0;
+ h->param.rc.i_bitrate = 0;
}
h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX );
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
+ h->param.rc.i_qp_step = x264_clip3( h->param.rc.i_qp_step, 0, QP_MAX );
+ h->param.rc.i_bitrate = x264_clip3( h->param.rc.i_bitrate, 0, 2000000 );
+ h->param.rc.i_vbv_buffer_size = x264_clip3( h->param.rc.i_vbv_buffer_size, 0, 2000000 );
+ h->param.rc.i_vbv_max_bitrate = x264_clip3( h->param.rc.i_vbv_max_bitrate, 0, 2000000 );
+ h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init, 0, 2000000 );
if( h->param.rc.i_vbv_buffer_size )
{
if( h->param.rc.i_rc_method == X264_RC_CQP )
@@ -575,49 +583,58 @@
h->param.rc.i_vbv_max_bitrate = 0;
}
- if( h->param.b_interlaced && h->param.i_slice_max_size )
- {
- x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
- h->param.i_slice_max_size = 0;
- }
- if( h->param.b_interlaced && h->param.i_slice_max_mbs )
- {
- x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
- h->param.i_slice_max_mbs = 0;
- }
- int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
+ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+ h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+
+ int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED);
if( h->param.b_sliced_threads )
h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
else
{
h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
- h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
- h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
h->param.i_slice_count = 0;
}
+ if( h->param.b_bluray_compat )
+ {
+ h->param.i_bframe_pyramid = X264_MIN( X264_B_PYRAMID_STRICT, h->param.i_bframe_pyramid );
+ h->param.i_bframe = X264_MIN( h->param.i_bframe, 3 );
+ h->param.b_aud = 1;
+ h->param.i_nal_hrd = X264_MAX( h->param.i_nal_hrd, X264_NAL_HRD_VBR );
+ h->param.i_slice_max_size = 0;
+ h->param.i_slice_max_mbs = 0;
+ h->param.b_intra_refresh = 0;
+ h->param.i_frame_reference = X264_MIN( h->param.i_frame_reference, 6 );
+ h->param.i_dpb_size = X264_MIN( h->param.i_dpb_size, 6 );
+ /* Due to the proliferation of broken players that don't handle dupes properly. */
+ h->param.analyse.i_weighted_pred = X264_MIN( h->param.analyse.i_weighted_pred, X264_WEIGHTP_SIMPLE );
+ if( h->param.b_fake_interlaced )
+ h->param.b_pic_struct = 1;
+ }
+
h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, X264_REF_MAX );
h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, X264_REF_MAX );
if( h->param.i_scenecut_threshold < 0 )
h->param.i_scenecut_threshold = 0;
+ h->param.analyse.i_direct_mv_pred = x264_clip3( h->param.analyse.i_direct_mv_pred, X264_DIRECT_PRED_NONE, X264_DIRECT_PRED_AUTO );
if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
{
x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
}
h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) );
- h->param.i_open_gop = x264_clip3( h->param.i_open_gop, X264_OPEN_GOP_NONE, X264_OPEN_GOP_BLURAY );
h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
if( h->param.i_bframe <= 1 )
h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE;
h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL );
+ h->param.i_bframe_adaptive = x264_clip3( h->param.i_bframe_adaptive, X264_B_ADAPT_NONE, X264_B_ADAPT_TRELLIS );
if( !h->param.i_bframe )
{
h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
h->param.analyse.i_direct_mv_pred = 0;
h->param.analyse.b_weighted_bipred = 0;
- h->param.i_open_gop = X264_OPEN_GOP_NONE;
+ h->param.b_open_gop = 0;
}
if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL )
{
@@ -630,10 +647,10 @@
h->param.i_frame_reference = 1;
h->param.i_dpb_size = 1;
}
- if( h->param.b_intra_refresh && h->param.i_open_gop )
+ if( h->param.b_intra_refresh && h->param.b_open_gop )
{
x264_log( h, X264_LOG_WARNING, "intra-refresh is not compatible with open-gop\n" );
- h->param.i_open_gop = X264_OPEN_GOP_NONE;
+ h->param.b_open_gop = 0;
}
float fps = h->param.i_fps_num > 0 && h->param.i_fps_den > 0 ? (float) h->param.i_fps_num / h->param.i_fps_den : 25.0;
if( h->param.i_keyint_min == X264_KEYINT_MIN_AUTO )
@@ -686,14 +703,12 @@
if( h->param.analyse.i_me_method < X264_ME_DIA ||
h->param.analyse.i_me_method > X264_ME_TESA )
h->param.analyse.i_me_method = X264_ME_HEX;
- if( h->param.analyse.i_me_range < 4 )
- h->param.analyse.i_me_range = 4;
+ h->param.analyse.i_me_range = x264_clip3( h->param.analyse.i_me_range, 4, 1024 );
if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
h->param.analyse.i_me_range = 16;
if( h->param.analyse.i_me_method == X264_ME_TESA &&
(h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
h->param.analyse.i_me_method = X264_ME_ESA;
- h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 10 );
h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
@@ -707,33 +722,57 @@
}
h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
+ h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
+ h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
+ if( h->param.rc.f_aq_strength == 0 )
+ h->param.rc.i_aq_mode = 0;
+
+ if( h->param.i_log_level < X264_LOG_INFO )
+ {
+ h->param.analyse.b_psnr = 0;
+ h->param.analyse.b_ssim = 0;
+ }
+ /* Warn users trying to measure PSNR/SSIM with psy opts on. */
+ if( b_open && (h->param.analyse.b_psnr || h->param.analyse.b_ssim) )
+ {
+ char *s = NULL;
+
+ if( h->param.analyse.b_psy )
+ {
+ s = h->param.analyse.b_psnr ? "psnr" : "ssim";
+ x264_log( h, X264_LOG_WARNING, "--%s used with psy on: results will be invalid!\n", s );
+ }
+ else if( !h->param.rc.i_aq_mode && h->param.analyse.b_ssim )
+ {
+ x264_log( h, X264_LOG_WARNING, "--ssim used with AQ off: results will be invalid!\n" );
+ s = "ssim";
+ }
+ else if( h->param.rc.i_aq_mode && h->param.analyse.b_psnr )
+ {
+ x264_log( h, X264_LOG_WARNING, "--psnr used with AQ on: results will be invalid!\n" );
+ s = "psnr";
+ }
+ if( s )
+ x264_log( h, X264_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s );
+ }
+
if( !h->param.analyse.b_psy )
{
h->param.analyse.f_psy_rd = 0;
h->param.analyse.f_psy_trellis = 0;
}
- if( !h->param.analyse.i_trellis )
- h->param.analyse.f_psy_trellis = 0;
h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
- if( h->param.analyse.i_subpel_refine < 6 )
- h->param.analyse.f_psy_rd = 0;
- h->mb.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
+ h->mb.i_psy_rd = h->param.analyse.i_subpel_refine >= 6 ? FIX8( h->param.analyse.f_psy_rd ) : 0;
+ h->mb.i_psy_trellis = h->param.analyse.i_trellis ? FIX8( h->param.analyse.f_psy_trellis / 4 ) : 0;
/* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
/* so we lower the chroma QP offset to compensate */
- /* This can be triggered repeatedly on multiple calls to parameter_validate, but since encoding
- * uses the pps chroma qp offset not the param chroma qp offset, this is not a problem. */
- if( h->mb.i_psy_rd )
+ if( b_open && h->mb.i_psy_rd )
h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
- h->mb.i_psy_trellis = FIX8( h->param.analyse.f_psy_trellis / 4 );
/* Psy trellis has a similar effect. */
- if( h->mb.i_psy_trellis )
+ if( b_open && h->mb.i_psy_trellis )
h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
- h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
- h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
- if( h->param.rc.f_aq_strength == 0 )
- h->param.rc.i_aq_mode = 0;
/* MB-tree requires AQ to be on, even if the strength is zero. */
if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
{
@@ -768,12 +807,27 @@
}
}
if( h->param.analyse.i_mv_range <= 0 )
- h->param.analyse.i_mv_range = l->mv_range >> h->param.b_interlaced;
+ h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
else
- h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> h->param.b_interlaced);
+ h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
}
h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
+
+ if( PARAM_INTERLACED )
+ {
+ if( h->param.analyse.i_me_method >= X264_ME_ESA )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
+ h->param.analyse.i_me_method = X264_ME_UMH;
+ }
+ if( h->param.analyse.i_weighted_pred > 0 )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
+ h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
+ }
+
if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy )
h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
@@ -800,6 +854,8 @@
h->param.analyse.i_mv_range_thread = r2;
}
+ if( h->param.rc.f_rate_tolerance < 0 )
+ h->param.rc.f_rate_tolerance = 0;
if( h->param.rc.f_qblur < 0 )
h->param.rc.f_qblur = 0;
if( h->param.rc.f_complexity_blur < 0 )
@@ -807,15 +863,11 @@
h->param.i_sps_id &= 31;
- if( h->param.i_log_level < X264_LOG_INFO )
- {
- h->param.analyse.b_psnr = 0;
- h->param.analyse.b_ssim = 0;
- }
-
- if( h->param.b_interlaced )
+ if( PARAM_INTERLACED )
h->param.b_pic_struct = 1;
+ h->param.i_nal_hrd = x264_clip3( h->param.i_nal_hrd, X264_NAL_HRD_NONE, X264_NAL_HRD_CBR );
+
if( h->param.i_nal_hrd && !h->param.rc.i_vbv_buffer_size )
{
x264_log( h, X264_LOG_WARNING, "NAL HRD parameters require VBV parameters\n" );
@@ -843,8 +895,12 @@
BOOLIFY( b_repeat_headers );
BOOLIFY( b_annexb );
BOOLIFY( b_vfr_input );
+ BOOLIFY( b_pulldown );
+ BOOLIFY( b_tff );
BOOLIFY( b_pic_struct );
BOOLIFY( b_fake_interlaced );
+ BOOLIFY( b_open_gop );
+ BOOLIFY( b_bluray_compat );
BOOLIFY( analyse.b_transform_8x8 );
BOOLIFY( analyse.b_weighted_bipred );
BOOLIFY( analyse.b_chroma_me );
@@ -937,7 +993,7 @@
goto fail;
}
- if( x264_validate_parameters( h ) < 0 )
+ if( x264_validate_parameters( h, 1 ) < 0 )
goto fail;
if( h->param.psz_cqm_file )
@@ -981,6 +1037,10 @@
h->mb.i_mb_width = h->sps->i_mb_width;
h->mb.i_mb_height = h->sps->i_mb_height;
h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height;
+ /* Adaptive MBAFF and subme 0 are not supported as we require halving motion
+ * vectors during prediction, resulting in hpel mvs.
+ * The chosen solution is to make MBAFF non-adaptive in this case. */
+ h->mb.b_adaptive_mbaff = PARAM_INTERLACED && h->param.analyse.i_subpel_refine;
/* Init frames. */
if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && !h->param.rc.b_stat_read )
@@ -1032,14 +1092,17 @@
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
- if( !h->param.b_cabac )
- x264_init_vlc_tables();
+ if( h->param.b_cabac )
+ x264_cabac_init();
+ else
+ x264_cavlc_init();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
- x264_zigzag_init( h->param.cpu, &h->zigzagf, h->param.b_interlaced );
+ x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
+ memcpy( &h->zigzagf, PARAM_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
x264_mc_init( h->param.cpu, &h->mc );
x264_quant_init( h, h->param.cpu, &h->quantf );
- x264_deblock_init( h->param.cpu, &h->loopf );
+ x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
@@ -1065,11 +1128,15 @@
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
+ float *logs = x264_analyse_prepare_costs( h );
+ if( !logs )
+ goto fail;
for( qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
- if( x264_analyse_init_costs( h, qp ) )
+ if( x264_analyse_init_costs( h, logs, qp ) )
goto fail;
- if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+ if( x264_analyse_init_costs( h, logs, X264_LOOKAHEAD_QP ) )
goto fail;
+ x264_free( logs );
static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
/* Checks for known miscompilation issues. */
@@ -1096,8 +1163,8 @@
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
: pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
- CHECKED_MALLOC( h->nal_buffer, h->out.i_bitstream * 3/2 + 4 );
h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+ CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
if( h->param.i_threads > 1 &&
x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
@@ -1248,27 +1315,22 @@
if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
param->rc.i_vbv_max_bitrate > 0 && param->rc.i_vbv_buffer_size > 0 )
{
+ rc_reconfig |= h->param.rc.i_vbv_max_bitrate != param->rc.i_vbv_max_bitrate;
+ rc_reconfig |= h->param.rc.i_vbv_buffer_size != param->rc.i_vbv_buffer_size;
+ rc_reconfig |= h->param.rc.i_bitrate != param->rc.i_bitrate;
COPY( rc.i_vbv_max_bitrate );
COPY( rc.i_vbv_buffer_size );
COPY( rc.i_bitrate );
- rc_reconfig = 1;
}
- if( h->param.rc.f_rf_constant != param->rc.f_rf_constant )
- {
- COPY( rc.f_rf_constant );
- rc_reconfig = 1;
- }
- if( h->param.rc.f_rf_constant_max != param->rc.f_rf_constant_max )
- {
- COPY( rc.f_rf_constant_max );
- rc_reconfig = 1;
- }
-
+ rc_reconfig |= h->param.rc.f_rf_constant != param->rc.f_rf_constant;
+ rc_reconfig |= h->param.rc.f_rf_constant_max != param->rc.f_rf_constant_max;
+ COPY( rc.f_rf_constant );
+ COPY( rc.f_rf_constant_max );
#undef COPY
mbcmp_init( h );
- int ret = x264_validate_parameters( h );
+ int ret = x264_validate_parameters( h, 0 );
/* Supported reconfiguration options (1-pass only):
* vbv-maxrate
@@ -1347,9 +1409,11 @@
nal_size += h->out.nal[i].i_payload;
/* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
- if( h->nal_buffer_size < nal_size * 3/2 + h->out.i_nal * 4 )
+ int necessary_size = nal_size * 3/2 + h->out.i_nal * 4;
+ if( h->nal_buffer_size < necessary_size )
{
- uint8_t *buf = x264_malloc( nal_size * 2 + h->out.i_nal * 4 );
+ h->nal_buffer_size = necessary_size * 2;
+ uint8_t *buf = x264_malloc( h->nal_buffer_size );
if( !buf )
return -1;
if( previous_nal_size )
@@ -1404,6 +1468,8 @@
return -1;
frame_size = x264_encoder_encapsulate_nals( h, 0 );
+ if( frame_size < 0 )
+ return -1;
/* now set output*/
*pi_nal = h->out.i_nal;
@@ -1489,7 +1555,7 @@
// and duplicates of that frame.
h->fenc->i_lines_weighted = 0;
- for( int i_ref = 0; i_ref < (h->i_ref[0] << h->sh.b_mbaff); i_ref++ )
+ for( int i_ref = 0; i_ref < (h->i_ref[0] << SLICE_MBAFF); i_ref++ )
for( int i = 0; i < 3; i++ )
h->sh.weight[i_ref][i].weightfn = NULL;
@@ -1497,7 +1563,7 @@
if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
return;
- int i_padv = PADV << h->param.b_interlaced;
+ int i_padv = PADV << PARAM_INTERLACED;
int denom = -1;
int weightplane[2] = { 0, 0 };
int buffer_next = 0;
@@ -1628,6 +1694,10 @@
h->i_ref[0] = X264_MIN( h->i_ref[0], h->frames.i_max_ref0 );
h->i_ref[0] = X264_MIN( h->i_ref[0], h->param.i_frame_reference ); // if reconfig() has lowered the limit
+ /* For Blu-ray compliance, don't reference frames outside of the minigop. */
+ if( IS_X264_TYPE_B( h->fenc->i_type ) && h->param.b_bluray_compat )
+ h->i_ref[0] = X264_MIN( h->i_ref[0], IS_X264_TYPE_B( h->fref[0][0]->i_type ) + 1 );
+
/* add duplicates */
if( h->fenc->i_type == X264_TYPE_P )
{
@@ -1676,24 +1746,37 @@
int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
int b_end = mb_y == h->i_threadslice_end;
int b_measure_quality = 1;
- int min_y = mb_y - (1 << h->sh.b_mbaff);
+ int min_y = mb_y - (1 << SLICE_MBAFF);
int b_start = min_y == h->i_threadslice_start;
- int max_y = b_end ? h->i_threadslice_end : mb_y;
+ /* Even in interlaced mode, deblocking never modifies more than 4 pixels
+ * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */
+ int minpix_y = min_y*16 - 4 * !b_start;
+ int maxpix_y = mb_y*16 - 4 * !b_end;
b_deblock &= b_hpel || h->param.psz_dump_yuv;
if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
{
b_deblock = 0; /* We already deblocked on the inloop pass. */
b_measure_quality = 0; /* We already measured quality on the inloop pass. */
}
- if( mb_y & h->sh.b_mbaff )
+ if( mb_y & SLICE_MBAFF )
return;
if( min_y < h->i_threadslice_start )
return;
if( b_deblock )
- for( int y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
+ for( int y = min_y; y < mb_y; y += (1 << SLICE_MBAFF) )
x264_frame_deblock_row( h, y );
+ /* FIXME: Prediction requires different borders for interlaced/progressive mc,
+ * but the actual image data is equivalent. For now, maintain this
+ * consistency by copying deblocked pixels between planes. */
+ if( PARAM_INTERLACED )
+ for( int p = 0; p < 2; p++ )
+ for( int i = minpix_y>>p; i < maxpix_y>>p; i++ )
+ memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
+ h->fdec->plane[p] + i*h->fdec->i_stride[p],
+ h->mb.i_mb_width*16*sizeof(pixel) );
+
if( b_hpel )
{
int end = mb_y == h->mb.i_mb_height;
@@ -1705,25 +1788,30 @@
}
}
- if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
- x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
+ if( SLICE_MBAFF )
+ for( int i = 0; i < 2; i++ )
+ {
+ XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
+ XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] );
+ }
- min_y = min_y*16 - 8 * !b_start;
- max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;
+ if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
+ x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << SLICE_MBAFF)) );
if( b_measure_quality )
{
+ maxpix_y = X264_MIN( maxpix_y, h->param.i_height );
if( h->param.analyse.b_psnr )
{
uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf,
- h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0],
- h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0],
- h->param.i_width, max_y-min_y );
+ h->fdec->plane[0] + minpix_y * h->fdec->i_stride[0], h->fdec->i_stride[0],
+ h->fenc->plane[0] + minpix_y * h->fenc->i_stride[0], h->fenc->i_stride[0],
+ h->param.i_width, maxpix_y-minpix_y );
uint64_t ssd_u, ssd_v;
x264_pixel_ssd_nv12( &h->pixf,
- h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
- h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
- h->param.i_width>>1, (max_y-min_y)>>1, &ssd_u, &ssd_v );
+ h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
+ h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
+ h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v );
h->stat.frame.i_ssd[0] += ssd_y;
h->stat.frame.i_ssd[1] += ssd_u;
h->stat.frame.i_ssd[2] += ssd_v;
@@ -1734,12 +1822,12 @@
x264_emms();
/* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
* and overlap by 4 */
- min_y += b_start ? 2 : -6;
+ minpix_y += b_start ? 2 : -6;
h->stat.frame.f_ssim +=
x264_pixel_ssim_wxh( &h->pixf,
- h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
- h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
- h->param.i_width-2, max_y-min_y, h->scratch_buffer );
+ h->fdec->plane[0] + 2+minpix_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+ h->fenc->plane[0] + 2+minpix_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+ h->param.i_width-2, maxpix_y-minpix_y, h->scratch_buffer );
}
}
}
@@ -1842,12 +1930,18 @@
}
}
+ if( h->fenc->i_type == X264_TYPE_BREF && h->param.b_bluray_compat && h->sh.i_mmco_command_count )
+ {
+ h->b_sh_backup = 1;
+ h->sh_backup = h->sh;
+ }
+
h->fdec->i_frame_num = h->sh.i_frame_num;
if( h->sps->i_poc_type == 0 )
{
h->sh.i_poc = h->fdec->i_poc;
- if( h->param.b_interlaced )
+ if( PARAM_INTERLACED )
{
h->sh.i_delta_poc_bottom = h->param.b_tff ? 1 : -1;
h->sh.i_poc += h->sh.i_delta_poc_bottom == -1;
@@ -1885,6 +1979,7 @@
* other inaccuracies. */
int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5;
int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : 0;
+ int back_up_bitstream = slice_max_size || (!h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH);
int starting_bits = bs_pos(&h->out.bs);
int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
int b_hpel = h->fdec->b_kept_as_ref;
@@ -1923,53 +2018,78 @@
last_emu_check = h->out.bs.p;
h->mb.i_last_qp = h->sh.i_qp;
h->mb.i_last_dqp = 0;
+ h->mb.field_decoding_flag = 0;
i_mb_y = h->sh.i_first_mb / h->mb.i_mb_width;
i_mb_x = h->sh.i_first_mb % h->mb.i_mb_width;
i_skip = 0;
- while( (mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width) <= h->sh.i_last_mb )
+ while( 1 )
{
+ mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width;
int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
- if( x264_bitstream_check_buffer( h ) )
- return -1;
-
- if( slice_max_size )
+ if( !(i_mb_y & SLICE_MBAFF) )
{
- mv_bits_bak = h->stat.frame.i_mv_bits;
- tex_bits_bak = h->stat.frame.i_tex_bits;
- /* We don't need the contexts because flushing the CABAC encoder has no context
- * dependency and macroblocks are only re-encoded in the case where a slice is
- * ended (and thus the content of all contexts are thrown away). */
- if( h->param.b_cabac )
- {
- memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
- /* x264's CABAC writer modifies the previous byte during carry, so it has to be
- * backed up. */
- cabac_prevbyte_bak = h->cabac.p[-1];
- }
- else
+ if( x264_bitstream_check_buffer( h ) )
+ return -1;
+
+ if( back_up_bitstream )
{
- bs_bak = h->out.bs;
- i_skip_bak = i_skip;
+ mv_bits_bak = h->stat.frame.i_mv_bits;
+ tex_bits_bak = h->stat.frame.i_tex_bits;
+ /* We don't need the contexts because flushing the CABAC encoder has no context
+ * dependency and macroblocks are only re-encoded in the case where a slice is
+ * ended (and thus the content of all contexts are thrown away). */
+ if( h->param.b_cabac )
+ {
+ memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
+ /* x264's CABAC writer modifies the previous byte during carry, so it has to be
+ * backed up. */
+ cabac_prevbyte_bak = h->cabac.p[-1];
+ }
+ else
+ {
+ bs_bak = h->out.bs;
+ i_skip_bak = i_skip;
+ }
}
}
if( i_mb_x == 0 && !h->mb.b_reencode_mb )
x264_fdec_filter_row( h, i_mb_y, 1 );
+ if( PARAM_INTERLACED )
+ {
+ if( h->mb.b_adaptive_mbaff )
+ {
+ if( !(i_mb_y&1) )
+ {
+ /* FIXME: VSAD is fast but fairly poor at choosing the best interlace type. */
+ h->mb.b_interlaced = x264_field_vsad( h, i_mb_x, i_mb_y );
+ memcpy( &h->zigzagf, MB_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
+ if( !MB_INTERLACED && (i_mb_y+2) == h->mb.i_mb_height )
+ x264_expand_border_mbpair( h, i_mb_x, i_mb_y );
+ }
+ }
+ h->mb.field[mb_xy] = MB_INTERLACED;
+ }
+
/* load cache */
- x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
+ if( SLICE_MBAFF )
+ x264_macroblock_cache_load_interlaced( h, i_mb_x, i_mb_y );
+ else
+ x264_macroblock_cache_load_progressive( h, i_mb_x, i_mb_y );
x264_macroblock_analyse( h );
/* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
+reencode:
x264_macroblock_encode( h );
if( h->param.b_cabac )
{
- if( mb_xy > h->sh.i_first_mb && !(h->sh.b_mbaff && (i_mb_y&1)) )
+ if( mb_xy > h->sh.i_first_mb && !(SLICE_MBAFF && (i_mb_y&1)) )
x264_cabac_encode_terminal( &h->cabac );
if( IS_SKIP( h->mb.i_type ) )
@@ -1993,6 +2113,19 @@
i_skip = 0;
}
x264_macroblock_write_cavlc( h );
+ /* If there was a CAVLC level code overflow, try again at a higher QP. */
+ if( h->mb.b_overflow )
+ {
+ h->mb.i_chroma_qp = h->chroma_qp_table[++h->mb.i_qp];
+ h->mb.i_skip_intra = 0;
+ h->mb.b_skip_mc = 0;
+ h->mb.b_overflow = 0;
+ h->out.bs = bs_bak;
+ i_skip = i_skip_bak;
+ h->stat.frame.i_mv_bits = mv_bits_bak;
+ h->stat.frame.i_tex_bits = tex_bits_bak;
+ goto reencode;
+ }
}
}
@@ -2030,7 +2163,16 @@
i_skip = i_skip_bak;
}
h->mb.b_reencode_mb = 1;
- h->sh.i_last_mb = mb_xy-1;
+ if( SLICE_MBAFF )
+ {
+ // set to bottom of previous mbpair
+ if( i_mb_x )
+ h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1));
+ else
+ h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1;
+ }
+ else
+ h->sh.i_last_mb = mb_xy-1;
break;
}
else
@@ -2055,9 +2197,10 @@
h->stat.frame.i_mb_count[h->mb.i_type]++;
int b_intra = IS_INTRA( h->mb.i_type );
+ int b_skip = IS_SKIP( h->mb.i_type );
if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write )
{
- if( !b_intra && !IS_SKIP( h->mb.i_type ) && !IS_DIRECT( h->mb.i_type ) )
+ if( !b_intra && !b_skip && !IS_DIRECT( h->mb.i_type ) )
{
if( h->mb.i_partition != D_8x8 )
h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
@@ -2102,24 +2245,19 @@
h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
h->stat.frame.i_mb_pred_mode[3][x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]]++;
}
+ h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED;
}
/* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
if( b_deblock )
- {
- int mvy_limit = 4 >> h->sh.b_mbaff;
- uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
- x264_macroblock_cache_load_deblock( h );
- if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
- memset( bs, 3, 2*4*4*sizeof(uint8_t) );
- else
- h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
- bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
- }
+ x264_macroblock_deblock_strength( h );
x264_ratecontrol_mb( h, mb_size );
- if( h->sh.b_mbaff )
+ if( mb_xy == h->sh.i_last_mb )
+ break;
+
+ if( SLICE_MBAFF )
{
i_mb_x += i_mb_y & 1;
i_mb_y ^= i_mb_x < h->mb.i_mb_width;
@@ -2179,6 +2317,7 @@
memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
dst->param = src->param;
dst->stat = src->stat;
+ dst->pixf = src->pixf;
}
static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
@@ -2202,15 +2341,28 @@
/* init stats */
memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
h->mb.b_reencode_mb = 0;
- while( h->sh.i_first_mb <= last_thread_mb )
+ while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb )
{
h->sh.i_last_mb = last_thread_mb;
if( h->param.i_slice_max_mbs )
- h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
+ {
+ if( SLICE_MBAFF )
+ {
+ // convert first to mbaff form, add slice-max-mbs, then convert back to normal form
+ int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width)
+ + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width)
+ + h->param.i_slice_max_mbs - 1;
+ int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2;
+ int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1;
+ h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y;
+ }
+ else
+ h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
+ }
else if( h->param.i_slice_count && !h->param.b_sliced_threads )
{
- int height = h->mb.i_mb_height >> h->param.b_interlaced;
- int width = h->mb.i_mb_width << h->param.b_interlaced;
+ int height = h->mb.i_mb_height >> PARAM_INTERLACED;
+ int width = h->mb.i_mb_width << PARAM_INTERLACED;
i_slice_num++;
h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
}
@@ -2218,6 +2370,9 @@
if( x264_stack_align( x264_slice_write, h ) )
return (void *)-1;
h->sh.i_first_mb = h->sh.i_last_mb + 1;
+ // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order
+ if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width )
+ h->sh.i_first_mb -= h->mb.i_mb_stride;
}
#if HAVE_VISUALIZE
@@ -2242,9 +2397,9 @@
t->param = h->param;
memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
}
- int height = h->mb.i_mb_height >> h->param.b_interlaced;
- t->i_threadslice_start = ((height * i + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
- t->i_threadslice_end = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
+ int height = h->mb.i_mb_height >> PARAM_INTERLACED;
+ t->i_threadslice_start = ((height * i + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
+ t->i_threadslice_end = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
t->sh.i_first_mb = t->i_threadslice_start * h->mb.i_mb_width;
t->sh.i_last_mb = t->i_threadslice_end * h->mb.i_mb_width - 1;
}
@@ -2270,7 +2425,7 @@
for( int i = 1; i < h->param.i_threads; i++ )
{
x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
- if( h->sh.b_mbaff )
+ if( SLICE_MBAFF )
x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 );
}
@@ -2410,7 +2565,11 @@
if( fenc->i_pic_struct == PIC_STRUCT_AUTO )
{
+#if HAVE_INTERLACED
int b_interlaced = fenc->param ? fenc->param->b_interlaced : h->param.b_interlaced;
+#else
+ int b_interlaced = 0;
+#endif
if( b_interlaced )
{
int b_tff = fenc->param ? fenc->param->b_tff : h->param.b_tff;
@@ -2520,7 +2679,7 @@
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
h->sh.i_type = SLICE_TYPE_I;
x264_reference_hierarchy_reset( h );
- if( h->param.i_open_gop )
+ if( h->param.b_open_gop )
h->frames.i_poc_last_open_gop = h->fenc->b_keyframe ? h->fenc->i_poc : -1;
}
else if( h->fenc->i_type == X264_TYPE_P )
@@ -2695,7 +2854,7 @@
if( h->fenc->i_type != X264_TYPE_IDR )
{
- int time_to_recovery = h->param.i_open_gop ? 0 : X264_MIN( h->mb.i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe - 1;
+ int time_to_recovery = h->param.b_open_gop ? 0 : X264_MIN( h->mb.i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe - 1;
x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
if( x264_nal_end( h ) )
@@ -2723,6 +2882,17 @@
overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
}
+ /* As required by Blu-ray. */
+ if( !IS_X264_TYPE_B( h->fenc->i_type ) && h->b_sh_backup )
+ {
+ h->b_sh_backup = 0;
+ x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+ x264_sei_dec_ref_pic_marking_write( h, &h->out.bs );
+ if( x264_nal_end( h ) )
+ return -1;
+ overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal-1);
+ }
+
if( h->fenc->b_keyframe && h->param.b_intra_refresh )
h->i_cpb_delay_pir_offset = h->fenc->i_cpb_delay;
@@ -2814,6 +2984,8 @@
}
int frame_size = x264_encoder_encapsulate_nals( h, 0 );
+ if( frame_size < 0 )
+ return -1;
/* Set output picture properties */
pic_out->i_type = h->fenc->i_type;
@@ -2867,6 +3039,8 @@
if( x264_nal_end( h ) )
return -1;
int total_size = x264_encoder_encapsulate_nals( h, h->out.i_nal-1 );
+ if( total_size < 0 )
+ return -1;
frame_size += total_size;
filler -= total_size;
}
@@ -2902,6 +3076,8 @@
for( int i_list = 0; i_list < 2; i_list++ )
for( int i = 0; i < X264_REF_MAX*2; i++ )
h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
+ for( int i = 0; i < 3; i++ )
+ h->stat.i_mb_field[i] += h->stat.frame.i_mb_field[i];
if( h->sh.i_type == SLICE_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
{
h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;
@@ -3171,15 +3347,30 @@
int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
+ SUM3b( h->stat.i_mb_count, I_16x16 );
int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
+ int64_t i_skip = SUM3b( h->stat.i_mb_count, P_SKIP )
+ + SUM3b( h->stat.i_mb_count, B_SKIP );
const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
h->stat.i_frame_count[SLICE_TYPE_P] +
h->stat.i_frame_count[SLICE_TYPE_B];
+ int64_t i_mb_count = (int64_t)i_count * h->mb.i_mb_count;
+ int64_t i_inter = i_mb_count - i_skip - i_intra;
const double duration = h->stat.f_frame_duration[SLICE_TYPE_I] +
h->stat.f_frame_duration[SLICE_TYPE_P] +
h->stat.f_frame_duration[SLICE_TYPE_B];
- int64_t i_mb_count = (int64_t)i_count * h->mb.i_mb_count;
float f_bitrate = SUM3(h->stat.i_frame_size) / duration / 125;
+ if( PARAM_INTERLACED )
+ {
+ char *fieldstats = buf;
+ fieldstats[0] = 0;
+ if( i_inter )
+ fieldstats += sprintf( fieldstats, " inter:%.1f%%", h->stat.i_mb_field[1] * 100.0 / i_inter );
+ if( i_skip )
+ fieldstats += sprintf( fieldstats, " skip:%.1f%%", h->stat.i_mb_field[2] * 100.0 / i_skip );
+ x264_log( h, X264_LOG_INFO, "field mbs: intra: %.1f%%%s\n",
+ h->stat.i_mb_field[0] * 100.0 / i_intra, buf );
+ }
+
if( h->pps->b_transform_8x8_mode )
{
buf[0] = 0;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/macroblock.c
^
|
@@ -273,59 +273,19 @@
h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
-static inline int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
-{
- dctcoef out[4];
- idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
- return ((ref[0] ^ (out[0]+32))
- | (ref[1] ^ (out[1]+32))
- | (ref[2] ^ (out[2]+32))
- | (ref[3] ^ (out[3]+32))) >> 6;
-}
-
/* Round down coefficients losslessly in DC-only chroma blocks.
* Unlike luma blocks, this can't be done with a lookup table or
* other shortcut technique because of the interdependencies
* between the coefficients due to the chroma DC transform. */
-static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, dctcoef dct2x2[4] )
+static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp )
{
- dctcoef dct2x2_orig[4];
- int coeff, nz;
+ int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
/* If the QP is too high, there's no benefit to rounding optimization. */
- if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
+ if( dmf > 32*64 )
return 1;
- idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
- dct2x2_orig[0] += 32;
- dct2x2_orig[1] += 32;
- dct2x2_orig[2] += 32;
- dct2x2_orig[3] += 32;
-
- /* If the DC coefficients already round to zero, terminate early. */
- if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
- return 0;
-
- /* Start with the highest frequency coefficient... is this the best option? */
- for( nz = 0, coeff = h->quantf.coeff_last[DCT_CHROMA_DC]( dct2x2 ); coeff >= 0; coeff-- )
- {
- int level = dct2x2[coeff];
- int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */
-
- while( level )
- {
- dct2x2[coeff] = level - sign;
- if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
- {
- nz = 1;
- dct2x2[coeff] = level;
- break;
- }
- level -= sign;
- }
- }
-
- return nz;
+ return h->quantf.optimize_chroma_dc( dct2x2, dmf );
}
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
@@ -370,7 +330,7 @@
if( nz_dc )
{
- if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+ if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
continue;
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
@@ -446,7 +406,7 @@
h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
if( !nz_dc ) /* Whole block is empty */
continue;
- if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+ if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
{
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
continue;
@@ -553,7 +513,7 @@
void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
{
- int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ int stride = h->fenc->i_stride[0] << MB_INTERLACED;
pixel *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
if( i_mode == I_PRED_4x4_V )
@@ -566,7 +526,7 @@
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] )
{
- int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ int stride = h->fenc->i_stride[0] << MB_INTERLACED;
pixel *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
if( i_mode == I_PRED_8x8_V )
@@ -579,7 +539,7 @@
void x264_predict_lossless_16x16( x264_t *h, int i_mode )
{
- int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ int stride = h->fenc->i_stride[0] << MB_INTERLACED;
if( i_mode == I_PRED_16x16_V )
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
else if( i_mode == I_PRED_16x16_H )
@@ -609,13 +569,8 @@
return;
}
- if( h->sh.b_mbaff
- && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
- && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
+ if( !h->mb.b_allow_skip )
{
- /* The first skip is predicted to be a frame mb pair.
- * We don't yet support the aff part of mbaff, so force it to non-skip
- * so that we can pick the aff flag. */
b_force_no_skip = 1;
if( IS_SKIP(h->mb.i_type) )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/me.c
^
|
@@ -814,7 +814,7 @@
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
@@ -951,8 +951,8 @@
pixel *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
int ref0 = h->mb.cache.ref[0][s8];
int ref1 = h->mb.cache.ref[1][s8];
- const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
int stride[2][9];
int bm0x = m0->mv[0];
int bm0y = m0->mv[1];
@@ -965,7 +965,7 @@
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
- static const int8_t dia4d[33][4] =
+ ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
{0,0,0,0},
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
@@ -1129,14 +1129,13 @@
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx, omy, pmx, pmy;
- unsigned bsatd;
- int satd;
+ int satd, bsatd;
int dir = -2;
int i8 = i4>>2;
uint16_t amvd;
@@ -1227,7 +1226,7 @@
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
- amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33) );
+ amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) );
x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/me.h
^
|
@@ -55,7 +55,8 @@
ALIGNED_4( int16_t mv[2] );
} ALIGNED_16( x264_me_t );
-typedef struct {
+typedef struct
+{
int sad;
int16_t mv[2];
} mvsad_t;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/ratecontrol.c
^
|
@@ -29,7 +29,6 @@
#define _ISOC99_SOURCE
#undef NDEBUG // always check asserts, the speed effect is far too small to disable them
-#include <math.h>
#include "common/common.h"
#include "ratecontrol.h"
@@ -63,10 +62,10 @@
typedef struct
{
- double coeff;
- double count;
- double decay;
- double offset;
+ float coeff;
+ float count;
+ float decay;
+ float offset;
} predictor_t;
struct x264_ratecontrol_t
@@ -88,7 +87,7 @@
int qp; /* qp for current frame */
float qpm; /* qp for current macroblock: precise float for AQ */
float qpa_rc; /* average of macroblocks' qp before aq */
- float qpa_aq; /* average of macroblocks' qp after aq */
+ int qpa_aq; /* average of macroblocks' qp after aq */
float qp_novbv; /* QP for the current frame if 1-pass VBV was disabled. */
/* VBV stuff */
@@ -168,8 +167,8 @@
static float rate_estimate_qscale( x264_t *h );
static int update_vbv( x264_t *h, int bits );
static void update_vbv_plan( x264_t *h, int overhead );
-static double predict_size( predictor_t *p, double q, double var );
-static void update_predictor( predictor_t *p, double q, double var, double bits );
+static float predict_size( predictor_t *p, float q, float var );
+static void update_predictor( predictor_t *p, float q, float var, float bits );
#define CMP_OPT_FIRST_PASS( opt, param_val )\
{\
@@ -184,13 +183,13 @@
* qp = h.264's quantizer
* qscale = linearized quantizer = Lagrange multiplier
*/
-static inline double qp2qscale( double qp )
+static inline float qp2qscale( float qp )
{
- return 0.85 * pow( 2.0, ( qp - 12.0 ) / 6.0 );
+ return 0.85f * powf( 2.0f, ( qp - 12.0f ) / 6.0f );
}
-static inline double qscale2qp( double qscale )
+static inline float qscale2qp( float qscale )
{
- return 12.0 + 6.0 * log2( qscale/0.85 );
+ return 12.0f + 6.0f * log2f( qscale/0.85f );
}
/* Texture bitrate is not quite inversely proportional to qscale,
@@ -206,32 +205,35 @@
+ rce->misc_bits;
}
-static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i )
+static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i, int b_store )
{
uint32_t sum = sum_ssd;
uint32_t ssd = sum_ssd >> 32;
- frame->i_pixel_sum[i] += sum;
- frame->i_pixel_ssd[i] += ssd;
+ if( b_store )
+ {
+ frame->i_pixel_sum[i] += sum;
+ frame->i_pixel_ssd[i] += ssd;
+ }
return ssd - ((uint64_t)sum * sum >> shift);
}
-static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int field, int b_store )
{
int w = i ? 8 : 16;
int stride = frame->i_stride[i];
- int offset = h->mb.b_interlaced
+ int offset = field
? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
: 16 * mb_x + w * mb_y * stride;
- stride <<= h->mb.b_interlaced;
+ stride <<= field;
if( i )
{
ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
- return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1 )
- + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2 );
+ return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1, b_store )
+ + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store );
}
else
- return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, 0 );
+ return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, 0, b_store );
}
// Find the total AC energy of the block in all planes.
@@ -241,8 +243,23 @@
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
- var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
+ uint32_t var;
+ if( h->mb.b_adaptive_mbaff )
+ {
+ /* We don't know the super-MB mode we're going to pick yet, so
+ * simply try both and pick the lower of the two. */
+ uint32_t var_interlaced, var_progressive;
+ var_interlaced = ac_energy_plane( h, mb_x, mb_y, frame, 0, 1, 1 );
+ var_interlaced += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1 );
+ var_progressive = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0 );
+ var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0 );
+ var = X264_MIN( var_interlaced, var_progressive );
+ }
+ else
+ {
+ var = ac_energy_plane( h, mb_x, mb_y, frame, 0, PARAM_INTERLACED, 1 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 1, PARAM_INTERLACED, 1 );
+ }
x264_emms();
return var;
}
@@ -460,6 +477,11 @@
if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
{
+ /* We don't support changing the ABR bitrate right now,
+ so if the stream starts as CBR, keep it CBR. */
+ if( rc->b_vbv_min_rate )
+ h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+
if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
{
h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
@@ -467,17 +489,10 @@
h->param.rc.i_vbv_buffer_size );
}
- /* We don't support changing the ABR bitrate right now,
- so if the stream starts as CBR, keep it CBR. */
- if( rc->b_vbv_min_rate )
- h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
-
int vbv_buffer_size = h->param.rc.i_vbv_buffer_size * 1000;
int vbv_max_bitrate = h->param.rc.i_vbv_max_bitrate * 1000;
/* Init HRD */
- h->sps->vui.hrd.i_bit_rate_unscaled = vbv_max_bitrate;
- h->sps->vui.hrd.i_cpb_size_unscaled = vbv_buffer_size;
if( h->param.i_nal_hrd && b_init )
{
h->sps->vui.hrd.i_cpb_cnt = 1;
@@ -522,7 +537,11 @@
x264_log( h, X264_LOG_WARNING, "VBV parameters cannot be changed when NAL HRD is in use\n" );
return;
}
+ h->sps->vui.hrd.i_bit_rate_unscaled = vbv_max_bitrate;
+ h->sps->vui.hrd.i_cpb_size_unscaled = vbv_buffer_size;
+ if( rc->b_vbv_min_rate )
+ rc->bitrate = h->param.rc.i_bitrate * 1000.;
rc->buffer_rate = vbv_max_bitrate / rc->fps;
rc->vbv_max_rate = vbv_max_bitrate;
rc->buffer_size = vbv_buffer_size;
@@ -736,7 +755,8 @@
CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
- CMP_OPT_FIRST_PASS( "open_gop", h->param.i_open_gop );
+ CMP_OPT_FIRST_PASS( "open_gop", h->param.b_open_gop );
+ CMP_OPT_FIRST_PASS( "bluray_compat", h->param.b_bluray_compat );
if( (p = strstr( opts, "keyint=" )) )
{
@@ -1199,6 +1219,8 @@
if( rc->b_vbv )
{
memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) );
+ memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) );
+ memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) );
rc->row_pred = &rc->row_preds[h->sh.i_type];
rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
update_vbv_plan( h, overhead );
@@ -1209,8 +1231,7 @@
int mincr = l->mincr;
- /* Blu-ray requires this */
- if( l->level_idc == 41 && h->param.i_nal_hrd )
+ if( h->param.b_bluray_compat )
mincr = 4;
/* High 10 doesn't require minCR, so just set the maximum to a large value. */
@@ -1237,11 +1258,7 @@
if( h->sh.i_type != SLICE_TYPE_B )
rc->bframes = h->fenc->i_bframes;
- if( i_force_qp != X264_QP_AUTO )
- {
- q = i_force_qp - 1;
- }
- else if( rc->b_abr )
+ if( rc->b_abr )
{
q = qscale2qp( rate_estimate_qscale( h ) );
}
@@ -1265,12 +1282,14 @@
q -= 6*log2f( zone->f_bitrate_factor );
}
}
+ if( i_force_qp != X264_QP_AUTO )
+ q = i_force_qp - 1;
q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
rc->qpa_rc =
rc->qpa_aq = 0;
- rc->qp = x264_clip3( (int)(q + 0.5), 0, QP_MAX );
+ rc->qp = x264_clip3( q + 0.5f, 0, QP_MAX );
h->fdec->f_qp_avg_rc =
h->fdec->f_qp_avg_aq =
rc->qpm = q;
@@ -1283,72 +1302,84 @@
rc->last_non_b_pict_type = h->sh.i_type;
}
-static double predict_row_size( x264_t *h, int y, double qp )
+static float predict_row_size( x264_t *h, int y, float qscale )
{
/* average between two predictors:
* absolute SATD, and scaled bit cost of the colocated row in the previous frame */
x264_ratecontrol_t *rc = h->rc;
- double pred_s = predict_size( rc->row_pred[0], qp2qscale( qp ), h->fdec->i_row_satd[y] );
- double pred_t = 0;
- if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref[0][0]->f_row_qp[y] )
+ float pred_s = predict_size( rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
+ if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] )
{
if( h->sh.i_type == SLICE_TYPE_P
&& h->fref[0][0]->i_type == h->fdec->i_type
+ && h->fref[0][0]->f_row_qscale[y] > 0
&& h->fref[0][0]->i_row_satd[y] > 0
&& (abs(h->fref[0][0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
{
- pred_t = h->fref[0][0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref[0][0]->i_row_satd[y]
- * qp2qscale( h->fref[0][0]->f_row_qp[y] ) / qp2qscale( qp );
+ float pred_t = h->fref[0][0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref[0][0]->i_row_satd[y]
+ * h->fref[0][0]->f_row_qscale[y] / qscale;
+ return (pred_s + pred_t) * 0.5f;
}
- if( pred_t == 0 )
- pred_t = pred_s;
- return (pred_s + pred_t) / 2;
+ return pred_s;
}
/* Our QP is lower than the reference! */
else
{
- double pred_intra = predict_size( rc->row_pred[1], qp2qscale( qp ), h->fdec->i_row_satds[0][0][y] );
+ float pred_intra = predict_size( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
/* Sum: better to overestimate than underestimate by using only one of the two predictors. */
return pred_intra + pred_s;
}
}
-static double row_bits_so_far( x264_t *h, int y )
+static int row_bits_so_far( x264_t *h, int y )
{
- double bits = 0;
+ int bits = 0;
for( int i = h->i_threadslice_start; i <= y; i++ )
bits += h->fdec->i_row_bits[i];
return bits;
}
-static double predict_row_size_sum( x264_t *h, int y, double qp )
+static float predict_row_size_sum( x264_t *h, int y, float qp )
{
- double bits = row_bits_so_far(h, y);
+ float qscale = qp2qscale( qp );
+ float bits = row_bits_so_far( h, y );
for( int i = y+1; i < h->i_threadslice_end; i++ )
- bits += predict_row_size( h, i, qp );
+ bits += predict_row_size( h, i, qscale );
return bits;
}
-
+/* TODO:
+ * eliminate all use of qp in row ratecontrol: make it entirely qscale-based.
+ * make this function stop being needlessly O(N^2)
+ * update more often than once per row? */
void x264_ratecontrol_mb( x264_t *h, int bits )
{
x264_ratecontrol_t *rc = h->rc;
const int y = h->mb.i_mb_y;
- x264_emms();
-
h->fdec->i_row_bits[y] += bits;
- rc->qpa_rc += rc->qpm;
rc->qpa_aq += h->mb.i_qp;
- if( h->mb.i_mb_x != h->mb.i_mb_width - 1 || !rc->b_vbv )
+ if( h->mb.i_mb_x != h->mb.i_mb_width - 1 )
+ return;
+
+ x264_emms();
+ rc->qpa_rc += rc->qpm * h->mb.i_mb_width;
+
+ if( !rc->b_vbv )
return;
+ float qscale = qp2qscale( rc->qpm );
h->fdec->f_row_qp[y] = rc->qpm;
+ h->fdec->f_row_qscale[y] = qscale;
- update_predictor( rc->row_pred[0], qp2qscale( rc->qpm ), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
+ update_predictor( rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref[0][0]->f_row_qp[y] )
- update_predictor( rc->row_pred[1], qp2qscale( rc->qpm ), h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+ update_predictor( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+
+ /* update ratecontrol per-mbpair in MBAFF */
+ if( SLICE_MBAFF && !(y&1) )
+ return;
/* tweak quality based on difference from predicted size */
if( y < h->i_threadslice_end-1 )
@@ -1359,7 +1390,7 @@
if( rc->rate_factor_max_increment )
qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment );
float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max );
- float step_size = 0.5;
+ float step_size = 0.5f;
/* B-frames shouldn't use lower QP than their reference frames. */
if( h->sh.i_type == SLICE_TYPE_B )
@@ -1370,7 +1401,7 @@
float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
- float max_frame_error = X264_MAX( 0.05, 1.0 / (h->mb.i_mb_height) );
+ float max_frame_error = X264_MAX( 0.05f, 1.0f / h->mb.i_mb_height );
float size_of_other_slices = 0;
if( h->param.b_sliced_threads )
{
@@ -1387,22 +1418,22 @@
/* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
- int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
+ float b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
/* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
/* area at the top of the frame was measured inaccurately. */
- if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
+ if( row_bits_so_far( h, y ) < 0.05f * slice_size_planned )
return;
if( h->sh.i_type != SLICE_TYPE_I )
- rc_tol /= 2;
+ rc_tol *= 0.5f;
if( !rc->b_vbv_min_rate )
qp_min = X264_MAX( qp_min, rc->qp_novbv );
while( rc->qpm < qp_max
&& ((b1 > rc->frame_size_planned + rc_tol) ||
- (rc->buffer_fill - b1 < buffer_left_planned * 0.5) ||
+ (rc->buffer_fill - b1 < buffer_left_planned * 0.5f) ||
(b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
{
rc->qpm += step_size;
@@ -1411,8 +1442,8 @@
while( rc->qpm > qp_min
&& (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv)
- && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
- || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
+ && ((b1 < rc->frame_size_planned * 0.8f && rc->qpm <= prev_row_qp)
+ || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1f) )
{
rc->qpm -= step_size;
b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
@@ -1427,14 +1458,16 @@
b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
- h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
+ h->rc->frame_size_estimated = b1 - size_of_other_slices;
}
+ else
+ h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
}
int x264_ratecontrol_qp( x264_t *h )
{
x264_emms();
- return x264_clip3( h->rc->qpm + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+ return x264_clip3( h->rc->qpm + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
}
int x264_ratecontrol_mb_qp( x264_t *h )
@@ -1450,7 +1483,7 @@
qp_offset *= (QP_MAX - qp) / (QP_MAX - QP_MAX_SPEC);
qp += qp_offset;
}
- return x264_clip3( qp + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+ return x264_clip3( qp + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
}
/* In 2pass, force the same frame types as in the 1st pass */
@@ -1526,7 +1559,7 @@
h->stat.frame.i_mb_count_p += mbs[i];
h->fdec->f_qp_avg_rc = rc->qpa_rc /= h->mb.i_mb_count;
- h->fdec->f_qp_avg_aq = rc->qpa_aq /= h->mb.i_mb_count;
+ h->fdec->f_qp_avg_aq = (float)rc->qpa_aq / h->mb.i_mb_count;
if( h->param.rc.b_stat_write )
{
@@ -1558,7 +1591,7 @@
for( int i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref[0]); i++ )
{
int refcount = use_old_stats ? rc->rce->refcount[i]
- : h->param.b_interlaced ? h->stat.frame.i_mb_count_ref[0][i*2]
+ : PARAM_INTERLACED ? h->stat.frame.i_mb_count_ref[0][i*2]
+ h->stat.frame.i_mb_count_ref[0][i*2+1]
: h->stat.frame.i_mb_count_ref[0][i];
if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
@@ -1689,7 +1722,14 @@
{
x264_ratecontrol_t *rcc= h->rc;
x264_zone_t *zone = get_zone( h, frame_num );
- double q = pow( rce->blurred_complexity, 1 - rcc->qcompress );
+ double q;
+ if( h->param.rc.b_mb_tree )
+ {
+ double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
+ q = pow( BASE_FRAME_DURATION / CLIP_DURATION(rce->i_duration * timescale), 1 - h->param.rc.f_qcompress );
+ }
+ else
+ q = pow( rce->blurred_complexity, 1 - rcc->qcompress );
// avoid NaN's in the rc_eq
if( !isfinite(q) || rce->tex_bits + rce->mv_bits == 0 )
@@ -1712,10 +1752,11 @@
return q;
}
-static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q)
+static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q, int frame_num)
{
x264_ratecontrol_t *rcc = h->rc;
const int pict_type = rce->pict_type;
+ x264_zone_t *zone = get_zone( h, frame_num );
// force I/B quants as a function of P quants
const double last_p_q = rcc->last_qscale_for[SLICE_TYPE_P];
@@ -1776,23 +1817,32 @@
rcc->accum_p_qp = mask * (qscale2qp( q ) + rcc->accum_p_qp);
rcc->accum_p_norm = mask * (1 + rcc->accum_p_norm);
}
+
+ if( zone )
+ {
+ if( zone->b_force_qp )
+ q = qp2qscale( zone->i_qp );
+ else
+ q /= zone->f_bitrate_factor;
+ }
+
return q;
}
-static double predict_size( predictor_t *p, double q, double var )
+static float predict_size( predictor_t *p, float q, float var )
{
- return (p->coeff*var + p->offset) / (q*p->count);
+ return (p->coeff*var + p->offset) / (q*p->count);
}
-static void update_predictor( predictor_t *p, double q, double var, double bits )
+static void update_predictor( predictor_t *p, float q, float var, float bits )
{
- const double range = 1.5;
+ float range = 1.5;
if( var < 10 )
return;
- double old_coeff = p->coeff / p->count;
- double new_coeff = bits*q / var;
- double new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
- double new_offset = bits*q - new_coeff_clipped * var;
+ float old_coeff = p->coeff / p->count;
+ float new_coeff = bits*q / var;
+ float new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
+ float new_offset = bits*q - new_coeff_clipped * var;
if( new_offset >= 0 )
new_coeff = new_coeff_clipped;
else
@@ -1829,7 +1879,8 @@
if( h->sps->vui.hrd.b_cbr_hrd && rct->buffer_fill_final > buffer_size )
{
- filler = ceil( (rct->buffer_fill_final - buffer_size) / (8. * h->sps->vui.i_time_scale) );
+ int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8;
+ filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale;
bits = X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8;
rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale;
}
@@ -1871,7 +1922,7 @@
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
- bits = X264_MAX(bits, t->rc->frame_size_estimated);
+ bits = X264_MAX(bits, t->rc->frame_size_estimated);
rcc->buffer_fill -= bits;
rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
rcc->buffer_fill += t->rc->buffer_rate;
@@ -2084,6 +2135,9 @@
rcc->frame_size_planned = qscale2bits( &rce, qp2qscale( q ) );
else
rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, qp2qscale( q ), h->fref[1][h->i_ref[1]-1]->i_satd );
+ /* Limit planned size by MinCR */
+ if( rcc->b_vbv )
+ rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
h->rc->frame_size_estimated = rcc->frame_size_planned;
/* For row SATDs */
@@ -2114,7 +2168,7 @@
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
- bits = X264_MAX(bits, t->rc->frame_size_estimated);
+ bits = X264_MAX(bits, t->rc->frame_size_estimated);
predicted_bits += (int64_t)bits;
}
}
@@ -2199,6 +2253,7 @@
rce.s_count = 0;
rce.qscale = 1;
rce.pict_type = pict_type;
+ rce.i_duration = h->fenc->i_duration;
if( h->param.rc.i_rc_method == X264_RC_CRF )
{
@@ -2274,6 +2329,9 @@
/* Always use up the whole VBV in this case. */
if( rcc->single_frame_vbv )
rcc->frame_size_planned = rcc->buffer_rate;
+ /* Limit planned size by MinCR */
+ if( rcc->b_vbv )
+ rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
h->rc->frame_size_estimated = rcc->frame_size_planned;
return q;
}
@@ -2384,13 +2442,14 @@
COPY(prev_zone);
COPY(qpbuf_pos);
/* these vars can be updated by x264_ratecontrol_init_reconfigurable */
- COPY(buffer_rate);
+ COPY(bitrate);
COPY(buffer_size);
+ COPY(buffer_rate);
+ COPY(vbv_max_rate);
COPY(single_frame_vbv);
COPY(cbr_decay);
- COPY(b_vbv_min_rate);
COPY(rate_factor_constant);
- COPY(bitrate);
+ COPY(rate_factor_max_increment);
#undef COPY
}
if( cur != next )
@@ -2651,14 +2710,14 @@
/* find qscale */
for( int i = 0; i < rcc->num_entries; i++ )
{
- qscale[i] = get_qscale( h, &rcc->entry[i], rate_factor, i );
+ qscale[i] = get_qscale( h, &rcc->entry[i], rate_factor, -1 );
rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i];
}
/* fixed I/B qscale relative to P */
for( int i = rcc->num_entries-1; i >= 0; i-- )
{
- qscale[i] = get_diff_limited_q( h, &rcc->entry[i], qscale[i] );
+ qscale[i] = get_diff_limited_q( h, &rcc->entry[i], qscale[i], i );
assert(qscale[i] >= 0);
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/rdo.c
^
|
@@ -205,8 +205,8 @@
/* Really should be 15 bytes, but rounding up a byte saves some
* instructions and is faster, and copying extra data doesn't hurt. */
- COPY_CABAC_PART( significant_coeff_flag_offset[h->mb.b_interlaced][cat], 16 );
- COPY_CABAC_PART( last_coeff_flag_offset[h->mb.b_interlaced][cat], 16 );
+ COPY_CABAC_PART( significant_coeff_flag_offset[MB_INTERLACED][cat], 16 );
+ COPY_CABAC_PART( last_coeff_flag_offset[MB_INTERLACED][cat], 16 );
COPY_CABAC_PART( coeff_abs_level_m1_offset[cat], 10 );
cb->f8_bits_encoded = 0;
}
@@ -387,7 +387,8 @@
}
}
-typedef struct {
+typedef struct
+{
int64_t score;
int level_idx; // index into level_tree[]
uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
@@ -425,7 +426,7 @@
trellis_node_t *nodes_cur = nodes[0];
trellis_node_t *nodes_prev = nodes[1];
trellis_node_t *bnode;
- const int b_interlaced = h->mb.b_interlaced;
+ const int b_interlaced = MB_INTERLACED;
uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
const int f = 1 << 15; // no deadzone
@@ -435,7 +436,8 @@
// (# of coefs) * (# of ctx) * (# of levels tried) = 1024
// we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
// but it takes more time to remove dead states than you gain in reduced memory.
- struct {
+ struct
+ {
uint16_t abs_level;
uint16_t next;
} level_tree[64*8*2];
@@ -839,12 +841,12 @@
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
+ NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, 0 );
return quant_trellis_cavlc( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
+ NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, 0, 0 );
}
@@ -855,14 +857,14 @@
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- x264_dct4_weight2_zigzag[h->mb.b_interlaced],
- x264_zigzag_scan4[h->mb.b_interlaced],
+ x264_dct4_weight2_zigzag[MB_INTERLACED],
+ x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
return quant_trellis_cavlc( h, dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- x264_dct4_weight2_zigzag[h->mb.b_interlaced],
- x264_zigzag_scan4[h->mb.b_interlaced],
+ x264_dct4_weight2_zigzag[MB_INTERLACED],
+ x264_zigzag_scan4[MB_INTERLACED],
ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx, 0 );
}
@@ -873,8 +875,8 @@
{
return quant_trellis_cabac( h, dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
- x264_dct8_weight2_zigzag[h->mb.b_interlaced],
- x264_zigzag_scan8[h->mb.b_interlaced],
+ x264_dct8_weight2_zigzag[MB_INTERLACED],
+ x264_zigzag_scan8[MB_INTERLACED],
DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
}
@@ -884,8 +886,8 @@
{
int nz = quant_trellis_cavlc( h, dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
- x264_dct8_weight2_zigzag[h->mb.b_interlaced],
- x264_zigzag_scan8[h->mb.b_interlaced],
+ x264_dct8_weight2_zigzag[MB_INTERLACED],
+ x264_zigzag_scan8[MB_INTERLACED],
DCT_LUMA_4x4, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 16, idx*4+i, 1 );
/* Set up nonzero count for future calls */
h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/set.c
^
|
@@ -24,8 +24,6 @@
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
-#include <math.h>
-
#include "common/common.h"
#include "set.h"
@@ -207,18 +205,18 @@
sps->vui.i_sar_height= param->vui.i_sar_height;
}
- sps->vui.b_overscan_info_present = ( param->vui.i_overscan ? 1 : 0 );
+ sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2;
if( sps->vui.b_overscan_info_present )
sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
sps->vui.b_signal_type_present = 0;
- sps->vui.i_vidformat = ( param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
+ sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
sps->vui.b_fullrange = ( param->vui.b_fullrange ? 1 : 0 );
sps->vui.b_color_description_present = 0;
- sps->vui.i_colorprim = ( param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 );
- sps->vui.i_transfer = ( param->vui.i_transfer <= 11 ? param->vui.i_transfer : 2 );
- sps->vui.i_colmatrix = ( param->vui.i_colmatrix <= 9 ? param->vui.i_colmatrix : 2 );
+ sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 8 ? param->vui.i_colorprim : 2 );
+ sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 10 ? param->vui.i_transfer : 2 );
+ sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 8 ? param->vui.i_colmatrix : 2 );
if( sps->vui.i_colorprim != 2 ||
sps->vui.i_transfer != 2 ||
sps->vui.i_colmatrix != 2 )
@@ -234,7 +232,7 @@
}
/* FIXME: not sufficient for interlaced video */
- sps->vui.b_chroma_loc_info_present = ( param->vui.i_chroma_loc ? 1 : 0 );
+ sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5;
if( sps->vui.b_chroma_loc_info_present )
{
sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
@@ -553,7 +551,6 @@
bs_flush( &q );
x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_RECOVERY_POINT );
-
}
int x264_sei_version_write( x264_t *h, bs_t *s )
@@ -691,6 +688,38 @@
bs_flush( s );
}
+void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s )
+{
+ x264_slice_header_t *sh = &h->sh_backup;
+ bs_t q;
+ uint8_t tmp_buf[100];
+ bs_init( &q, tmp_buf, 100 );
+
+ bs_realign( &q );
+
+ /* We currently only use this for repeating B-refs, as required by Blu-ray. */
+ bs_write1( &q, 0 ); //original_idr_flag
+ bs_write_ue( &q, sh->i_frame_num ); //original_frame_num
+ if( !h->sps->b_frame_mbs_only )
+ bs_write1( &q, 0 ); //original_field_pic_flag
+
+ bs_write1( &q, sh->i_mmco_command_count > 0 );
+ if( sh->i_mmco_command_count > 0 )
+ {
+ for( int i = 0; i < sh->i_mmco_command_count; i++ )
+ {
+ bs_write_ue( &q, 1 );
+ bs_write_ue( &q, sh->mmco[i].i_difference_of_pic_nums - 1 );
+ }
+ bs_write_ue( &q, 0 );
+ }
+
+ bs_align_10( &q );
+ bs_flush( &q );
+
+ x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_DEC_REF_PIC_MARKING );
+}
+
const x264_level_t x264_levels[] =
{
{ 10, 1485, 99, 152064, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/set.h
^
|
@@ -36,6 +36,7 @@
int x264_validate_levels( x264_t *h, int verbose );
void x264_sei_buffering_period_write( x264_t *h, bs_t *s );
void x264_sei_pic_timing_write( x264_t *h, bs_t *s );
+void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s );
void x264_sei_frame_packing_write( x264_t *h, bs_t *s );
void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type );
void x264_filler_write( x264_t *h, bs_t *s, int filler );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/encoder/slicetype.c
^
|
@@ -25,8 +25,6 @@
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
-#include <math.h>
-
#include "common/common.h"
#include "macroblock.h"
#include "me.h"
@@ -169,14 +167,18 @@
for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
{
w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
- cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+ int cmp = h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride );
+ cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
}
cost += x264_weight_slice_header_cost( h, w, 0 );
}
else
for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
- cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+ {
+ int cmp = h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride );
+ cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
+ }
x264_emms();
return cost;
}
@@ -369,8 +371,8 @@
const int i_stride = fenc->i_stride_lowres;
const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
- int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
- int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
+ int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] };
+ int (*fenc_costs[2]) = { &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy], &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 &&
i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) ||
h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
@@ -578,15 +580,14 @@
i_icost += intra_penalty;
fenc->i_intra_cost[i_mb_xy] = i_icost;
+ int i_icost_aq = i_icost;
+ if( h->param.rc.i_aq_mode )
+ i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
+ fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
if( b_frame_score_mb )
{
- int *row_satd_intra = frames[b]->i_row_satds[0][0];
- int i_icost_aq = i_icost;
- if( h->param.rc.i_aq_mode )
- i_icost_aq = (i_icost_aq * frames[b]->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
fenc->i_cost_est[0][0] += i_icost;
fenc->i_cost_est_aq[0][0] += i_icost_aq;
- row_satd_intra[h->mb.i_mb_y] += i_icost_aq;
}
}
@@ -610,13 +611,13 @@
{
int i_bcost_aq = i_bcost;
if( h->param.rc.i_aq_mode )
- i_bcost_aq = (i_bcost_aq * frames[b]->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
+ i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
if( b_frame_score_mb )
{
/* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
- frames[b]->i_cost_est[b-p0][p1-b] += i_bcost;
- frames[b]->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
+ fenc->i_cost_est[b-p0][p1-b] += i_bcost;
+ fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
}
}
@@ -750,8 +751,7 @@
static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
- int fps_factor_intra = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 );
- int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 );
+ int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
float weightdelta = 0.0;
if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -762,11 +762,10 @@
for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
{
int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
- int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8;
if( intra_cost )
{
- int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8;
- float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+ int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor + 128) >> 8;
+ float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
}
}
@@ -1103,9 +1102,9 @@
/* Uses strings due to the fact that the speed of the control functions is
negligible compared to the cost of running slicetype_frame_cost, and because
it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, char (*best_paths)[X264_LOOKAHEAD_MAX] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, char (*best_paths)[X264_LOOKAHEAD_MAX+1] )
{
- char paths[2][X264_LOOKAHEAD_MAX];
+ char paths[2][X264_LOOKAHEAD_MAX+1];
int num_paths = X264_MIN( h->param.i_bframe+1, length );
int best_cost = COST_MAX;
int idx = 0;
@@ -1256,6 +1255,8 @@
* more RD-optimal. */
if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || vbv_lookahead )
num_frames = framecnt;
+ else if( h->param.b_open_gop && num_frames < framecnt )
+ num_frames++;
else if( num_frames == 0 )
{
frames[1]->i_type = X264_TYPE_I;
@@ -1277,11 +1278,11 @@
{
if( num_frames > 1 )
{
- char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX] = {"","P"};
- int best_path_index = (num_frames-1) % (X264_BFRAME_MAX+1);
+ char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX+1] = {"","P"};
+ int best_path_index = num_frames % (X264_BFRAME_MAX+1);
/* Perform the frametype analysis. */
- for( int j = 2; j < num_frames; j++ )
+ for( int j = 2; j <= num_frames; j++ )
x264_slicetype_path( h, &a, frames, j, best_paths );
num_bframes = strspn( best_paths[best_path_index], "B" );
@@ -1375,7 +1376,7 @@
{
frames[i]->i_type = X264_TYPE_I;
reset_start = X264_MIN( reset_start, i+1 );
- if( h->param.i_open_gop == X264_OPEN_GOP_BLURAY )
+ if( h->param.b_open_gop && h->param.b_bluray_compat )
while( IS_X264_TYPE_B( frames[i-1]->i_type ) )
i--;
}
@@ -1463,25 +1464,25 @@
}
if( frm->i_type == X264_TYPE_KEYFRAME )
- frm->i_type = h->param.i_open_gop ? X264_TYPE_I : X264_TYPE_IDR;
+ frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR;
/* Limit GOP size */
if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max )
{
if( frm->i_type == X264_TYPE_AUTO || frm->i_type == X264_TYPE_I )
- frm->i_type = h->param.i_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
+ frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
int warn = frm->i_type != X264_TYPE_IDR;
- if( warn && h->param.i_open_gop )
+ if( warn && h->param.b_open_gop )
warn &= frm->i_type != X264_TYPE_I;
if( warn )
x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame );
}
if( frm->i_type == X264_TYPE_I && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min )
{
- if( h->param.i_open_gop )
+ if( h->param.b_open_gop )
{
h->lookahead->i_last_keyframe = frm->i_frame; // Use display order
- if( h->param.i_open_gop == X264_OPEN_GOP_BLURAY )
+ if( h->param.b_bluray_compat )
h->lookahead->i_last_keyframe -= bframes; // Use bluray order
frm->b_keyframe = 1;
}
@@ -1655,7 +1656,7 @@
int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */
for( int y = 0; y < h->mb.i_mb_height; y++ )
{
- int mb_xy = y * h->mb.i_mb_stride;
+ int mb_xy = y * h->mb.i_mb_stride + h->fdec->i_pir_start_col;
for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
{
int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/filters/filters.c
^
|
@@ -27,7 +27,7 @@
#include "filters.h"
#define RETURN_IF_ERROR( cond, ... ) RETURN_IF_ERR( cond, "options", NULL, __VA_ARGS__ )
-char **x264_split_string( char *string, char *sep, uint32_t limit )
+char **x264_split_string( char *string, char *sep, int limit )
{
if( !string )
return NULL;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/filters/filters.h
^
|
@@ -30,7 +30,7 @@
#include "x264cli.h"
#include "filters/video/video.h"
-char **x264_split_string( char *string, char *sep, uint32_t limit );
+char **x264_split_string( char *string, char *sep, int limit );
void x264_free_string_array( char **array );
char **x264_split_options( const char *opt_str, const char *options[] );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/filters/video/resize.c
^
|
@@ -41,9 +41,8 @@
#if HAVE_SWSCALE
#undef DECLARE_ALIGNED
#include <libswscale/swscale.h>
-
-/* this function is not a part of the swscale API but is defined in swscale_internal.h */
-const char *sws_format_name( enum PixelFormat format );
+#include <libavutil/opt.h>
+#include <libavutil/pixdesc.h>
typedef struct
{
@@ -61,10 +60,11 @@
int buffer_allocated;
int dst_csp;
struct SwsContext *ctx;
- int ctx_flags;
+ uint32_t ctx_flags;
/* state of swapping chroma planes pre and post resize */
int pre_swap_chroma;
int post_swap_chroma;
+ int variable_input; /* input is capable of changing properties */
frame_prop_t dst; /* desired output properties */
frame_prop_t scale; /* properties of the SwsContext input */
} resizer_hnd_t;
@@ -98,16 +98,6 @@
" - area, bicublin, gauss, sinc, lanczos, spline\n" );
}
-static uint32_t convert_cpu_to_flag( uint32_t cpu )
-{
- uint32_t swscale_cpu = 0;
- if( cpu & X264_CPU_ALTIVEC )
- swscale_cpu |= SWS_CPU_CAPS_ALTIVEC;
- if( cpu & X264_CPU_MMXEXT )
- swscale_cpu |= SWS_CPU_CAPS_MMX | SWS_CPU_CAPS_MMX2;
- return swscale_cpu;
-}
-
static uint32_t convert_method_to_flag( const char *name )
{
uint32_t flag = 0;
@@ -348,6 +338,54 @@
return 0;
}
+static int x264_init_sws_context( resizer_hnd_t *h )
+{
+ if( !h->ctx )
+ {
+ h->ctx = sws_alloc_context();
+ if( !h->ctx )
+ return -1;
+
+ /* set flags that will not change */
+ av_set_int( h->ctx, "sws_flags", h->ctx_flags );
+ av_set_int( h->ctx, "dstw", h->dst.width );
+ av_set_int( h->ctx, "dsth", h->dst.height );
+ av_set_int( h->ctx, "dst_format", h->dst.pix_fmt );
+ av_set_int( h->ctx, "dst_range", 0 ); /* FIXME: use the correct full range value */
+ }
+
+ av_set_int( h->ctx, "srcw", h->scale.width );
+ av_set_int( h->ctx, "srch", h->scale.height );
+ av_set_int( h->ctx, "src_format", h->scale.pix_fmt );
+ av_set_int( h->ctx, "src_range", 0 ); /* FIXME: use the correct full range value */
+
+ /* FIXME: use the correct full range values
+ * FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */
+ sws_setColorspaceDetails( h->ctx, sws_getCoefficients( SWS_CS_DEFAULT ), 0,
+ sws_getCoefficients( SWS_CS_DEFAULT ), 0, 0, 1<<16, 1<<16 );
+
+ return sws_init_context( h->ctx, NULL, NULL ) < 0;
+}
+
+static int check_resizer( resizer_hnd_t *h, cli_pic_t *in, int frame )
+{
+ frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ) };
+ if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) )
+ return 0;
+ /* also warn if the resizer was initialized after the first frame */
+ if( h->ctx || frame )
+ x264_cli_log( NAME, X264_LOG_WARNING, "stream properties changed at pts %"PRId64"\n", in->pts );
+ h->scale = input_prop;
+ if( !h->buffer_allocated )
+ {
+ if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, h->dst.width, h->dst.height ) )
+ return -1;
+ h->buffer_allocated = 1;
+ }
+ FAIL_IF_ERROR( x264_init_sws_context( h ), "swscale init failed\n" )
+ return 0;
+}
+
static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
{
/* if called for normalizing the csp to known formats and the format is not unknown, exit */
@@ -372,6 +410,8 @@
h->dst.height = info->height;
if( !strcmp( opt_string, "normcsp" ) )
{
+ /* only in normalization scenarios is the input capable of changing properties */
+ h->variable_input = 1;
h->dst_csp = pick_closest_supported_csp( info->csp );
/* now fix the catch-all i420 choice if it does not allow for the current input resolution dimensions. */
if( h->dst_csp == X264_CSP_I420 && info->width&1 )
@@ -388,11 +428,10 @@
h->dst.width = param->i_width;
h->dst.height = param->i_height;
}
- uint32_t method = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) );
+ h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) );
x264_free_string_array( opts );
- h->ctx_flags = convert_cpu_to_flag( param->cpu ) | method;
- if( method != SWS_FAST_BILINEAR )
+ if( h->ctx_flags != SWS_FAST_BILINEAR )
h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND;
h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
h->scale = h->dst;
@@ -408,13 +447,13 @@
/* confirm swscale can support this conversion */
FAIL_IF_ERROR( src_pix_fmt == PIX_FMT_NONE && src_pix_fmt_inv != PIX_FMT_NONE,
- "input colorspace %s with bit depth %d is not supported\n", sws_format_name( src_pix_fmt_inv ),
+ "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( src_pix_fmt_inv ),
info->csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
- FAIL_IF_ERROR( !sws_isSupportedInput( src_pix_fmt ), "input colorspace %s is not supported\n", sws_format_name( src_pix_fmt ) )
+ FAIL_IF_ERROR( !sws_isSupportedInput( src_pix_fmt ), "input colorspace %s is not supported\n", av_get_pix_fmt_name( src_pix_fmt ) )
FAIL_IF_ERROR( h->dst.pix_fmt == PIX_FMT_NONE && dst_pix_fmt_inv != PIX_FMT_NONE,
- "input colorspace %s with bit depth %d is not supported\n", sws_format_name( dst_pix_fmt_inv ),
+ "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( dst_pix_fmt_inv ),
h->dst_csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
- FAIL_IF_ERROR( !sws_isSupportedOutput( h->dst.pix_fmt ), "output colorspace %s is not supported\n", sws_format_name( h->dst.pix_fmt ) )
+ FAIL_IF_ERROR( !sws_isSupportedOutput( h->dst.pix_fmt ), "output colorspace %s is not supported\n", av_get_pix_fmt_name( h->dst.pix_fmt ) )
FAIL_IF_ERROR( h->dst.height != info->height && info->interlaced,
"swscale is not compatible with interlaced vertical resizing\n" )
/* confirm that the desired resolution meets the colorspace requirements */
@@ -426,8 +465,17 @@
x264_cli_log( NAME, X264_LOG_INFO, "resizing to %dx%d\n", h->dst.width, h->dst.height );
if( h->dst.pix_fmt != src_pix_fmt )
x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n",
- sws_format_name( src_pix_fmt ), sws_format_name( h->dst.pix_fmt ) );
+ av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) );
h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip
+
+ /* if the input is not variable, initialize the context */
+ if( !h->variable_input )
+ {
+ cli_pic_t input_pic = {{info->csp, info->width, info->height, 0}, 0};
+ if( check_resizer( h, &input_pic, 0 ) )
+ return -1;
+ }
+
/* finished initing, overwrite values */
info->csp = h->dst_csp;
info->width = h->dst.width;
@@ -441,35 +489,12 @@
return 0;
}
-static int check_resizer( resizer_hnd_t *h, cli_pic_t *in )
-{
- frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ) };
- if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) )
- return 0;
- if( h->ctx )
- {
- sws_freeContext( h->ctx );
- x264_cli_log( NAME, X264_LOG_WARNING, "stream properties changed at pts %"PRId64"\n", in->pts );
- }
- h->scale = input_prop;
- if( !h->buffer_allocated )
- {
- if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, h->dst.width, h->dst.height ) )
- return -1;
- h->buffer_allocated = 1;
- }
- h->ctx = sws_getContext( h->scale.width, h->scale.height, h->scale.pix_fmt, h->dst.width,
- h->dst.height, h->dst.pix_fmt, h->ctx_flags, NULL, NULL, NULL );
- FAIL_IF_ERROR( !h->ctx, "swscale init failed\n" )
- return 0;
-}
-
static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
resizer_hnd_t *h = handle;
if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
return -1;
- if( check_resizer( h, output ) )
+ if( h->variable_input && check_resizer( h, output, frame ) )
return -1;
if( h->pre_swap_chroma )
XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/input/ffms.c
^
|
@@ -85,8 +85,13 @@
}
if( !idx )
{
- idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, &h->time, &e );
- fprintf( stderr, " \r" );
+ if( opt->progress )
+ {
+ idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, &h->time, &e );
+ fprintf( stderr, " \r" );
+ }
+ else
+ idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, NULL, NULL, &e );
FAIL_IF_ERROR( !idx, "could not create index\n" )
if( opt->index_file && FFMS_WriteIndex( opt->index_file, idx, &e ) )
x264_cli_log( "ffms", X264_LOG_WARNING, "could not write index file\n" );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/input/input.h
^
|
@@ -40,6 +40,7 @@
int bit_depth;
char *timebase;
int seek;
+ int progress;
} cli_input_opt_t;
/* properties of the source given by the demuxer */
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/input/lavf.c
^
|
@@ -145,6 +145,7 @@
return -1;
sscanf( opt->resolution, "%dx%d", ¶m->width, ¶m->height );
param->pix_fmt = opt->colorspace ? av_get_pix_fmt( opt->colorspace ) : PIX_FMT_YUV420P;
+ FAIL_IF_ERROR( param->pix_fmt == PIX_FMT_NONE, "unsupported colorspace: %s\n", opt->colorspace );
}
/* specify the input format. this is helpful when lavf fails to guess */
@@ -158,7 +159,7 @@
FAIL_IF_ERROR( av_find_stream_info( h->lavf ) < 0, "could not find input stream info\n" )
int i = 0;
- while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != CODEC_TYPE_VIDEO )
+ while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != AVMEDIA_TYPE_VIDEO )
i++;
FAIL_IF_ERROR( i == h->lavf->nb_streams, "could not find video stream\n" )
h->stream_id = i;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/input/timecode.c
^
|
@@ -25,7 +25,6 @@
#include "input.h"
#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "timecode", __VA_ARGS__ )
-#include <math.h>
typedef struct
{
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/input/y4m.c
^
|
@@ -201,7 +201,7 @@
static int read_frame_internal( cli_pic_t *pic, y4m_hnd_t *h )
{
- int slen = strlen( Y4M_FRAME_MAGIC );
+ size_t slen = strlen( Y4M_FRAME_MAGIC );
int i = 0;
char header[16];
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/output/mp4.c
^
|
@@ -30,8 +30,10 @@
#if HAVE_GF_MALLOC
#undef malloc
#undef free
+#undef realloc
#define malloc gf_malloc
#define free gf_free
+#define realloc gf_realloc
#endif
typedef struct
@@ -49,6 +51,7 @@
int i_delay_frames;
int b_dts_compress;
int i_dts_compress_multiplier;
+ int i_data_size;
} mp4_hnd_t;
static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )
@@ -233,10 +236,27 @@
gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
}
- p_mp4->p_sample->data = malloc( p_param->i_width * p_param->i_height * 3 / 2 );
+ p_mp4->i_data_size = p_param->i_width * p_param->i_height * 3 / 2;
+ p_mp4->p_sample->data = malloc( p_mp4->i_data_size );
if( !p_mp4->p_sample->data )
+ {
+ p_mp4->i_data_size = 0;
return -1;
+ }
+
+ return 0;
+}
+static int check_buffer( mp4_hnd_t *p_mp4, int needed_size )
+{
+ if( needed_size > p_mp4->i_data_size )
+ {
+ void *ptr = realloc( p_mp4->p_sample->data, needed_size );
+ if( !ptr )
+ return -1;
+ p_mp4->p_sample->data = ptr;
+ p_mp4->i_data_size = needed_size;
+ }
return 0;
}
@@ -284,6 +304,8 @@
// SEI
+ if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + sei_size ) )
+ return -1;
memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size );
p_mp4->p_sample->dataLength += sei_size;
@@ -296,6 +318,8 @@
int64_t dts;
int64_t cts;
+ if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + i_size ) )
+ return -1;
memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size );
p_mp4->p_sample->dataLength += i_size;
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/tools/checkasm.c
^
|
@@ -26,10 +26,6 @@
*****************************************************************************/
#include <ctype.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <math.h>
-
#include "common/common.h"
#include "common/cpu.h"
@@ -61,14 +57,16 @@
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 10 // number of different combinations of cpu flags
-typedef struct {
+typedef struct
+{
void *pointer; // just for detecting duplicates
uint32_t cpu;
uint32_t cycles;
uint32_t den;
} bench_t;
-typedef struct {
+typedef struct
+{
char *name;
bench_t vers[MAX_CPUS];
} bench_func_t;
@@ -90,7 +88,7 @@
static inline uint32_t read_time(void)
{
uint32_t a = 0;
-#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64)
+#if HAVE_X86_INLINE_ASM
asm volatile( "rdtsc" :"=a"(a) ::"edx" );
#elif ARCH_PPC
asm volatile( "mftb %0" : "=r" (a) );
@@ -419,6 +417,26 @@
}
report( "pixel hadamard_ac :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.vsad != pixel_ref.vsad )
+ {
+ for( int h = 2; h <= 32; h += 2 )
+ {
+ int res_c, res_asm;
+ set_func_name( "vsad" );
+ used_asm = 1;
+ res_c = call_c( pixel_c.vsad, pbuf1, 16, h );
+ res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h );
+ if( res_c != res_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
+ break;
+ }
+ }
+ }
+ report( "pixel vsad :" );
+
#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
@@ -532,7 +550,7 @@
x264_dct_function_t dct_ref;
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
- int ret = 0, ok, used_asm, interlace;
+ int ret = 0, ok, used_asm, interlace = 0;
ALIGNED_16( dctcoef dct1[16][16] );
ALIGNED_16( dctcoef dct2[16][16] );
ALIGNED_16( dctcoef dct4[16][16] );
@@ -697,21 +715,21 @@
TEST_DCTDC( idct4x4dc );
#undef TEST_DCTDC
- x264_zigzag_function_t zigzag_c;
- x264_zigzag_function_t zigzag_ref;
- x264_zigzag_function_t zigzag_asm;
+ x264_zigzag_function_t zigzag_c[2];
+ x264_zigzag_function_t zigzag_ref[2];
+ x264_zigzag_function_t zigzag_asm[2];
ALIGNED_16( dctcoef level1[64] );
ALIGNED_16( dctcoef level2[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
- if( zigzag_asm.name != zigzag_ref.name ) \
+ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
memcpy(dct, buf1, size*sizeof(dctcoef)); \
- call_c( zigzag_c.name, t1, dct ); \
- call_a( zigzag_asm.name, t2, dct ); \
+ call_c( zigzag_c[interlace].name, t1, dct ); \
+ call_a( zigzag_asm[interlace].name, t2, dct ); \
if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
{ \
ok = 0; \
@@ -720,26 +738,26 @@
}
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
- if( zigzag_asm.name != zigzag_ref.name ) \
+ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
{ \
int nz_a, nz_c; \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
- nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \
- nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
+ nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
+ nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
- call_c2( zigzag_c.name, t1, pbuf2, pbuf3 ); \
- call_a2( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
+ call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
+ call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
}
#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
- if( zigzag_asm.name != zigzag_ref.name ) \
+ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
{ \
int nz_a, nz_c; \
dctcoef dc_a, dc_c; \
@@ -754,8 +772,8 @@
memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
} \
- nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
- nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
+ nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
+ nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
{ \
ok = 0; \
@@ -763,12 +781,12 @@
break; \
} \
} \
- call_c2( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
- call_a2( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
+ call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
+ call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
}
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
- if( zigzag_asm.name != zigzag_ref.name ) \
+ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
{ \
for( int j = 0; j < 100; j++ ) \
{ \
@@ -778,8 +796,8 @@
for( int i = 0; i < size; i++ ) \
dct[i] = rand()&0x1F ? 0 : dct[i]; \
memcpy(buf3, buf4, 10); \
- call_c( zigzag_c.name, t1, dct, buf3 ); \
- call_a( zigzag_asm.name, t2, dct, buf4 ); \
+ call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \
+ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
{ \
ok = 0; \
@@ -787,33 +805,23 @@
} \
}
- interlace = 0;
- x264_zigzag_init( 0, &zigzag_c, 0 );
- x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
- x264_zigzag_init( cpu_new, &zigzag_asm, 0 );
-
- ok = 1; used_asm = 0;
- TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
- TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
- TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
- TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
- report( "zigzag_frame :" );
-
- interlace = 1;
- x264_zigzag_init( 0, &zigzag_c, 1 );
- x264_zigzag_init( cpu_ref, &zigzag_ref, 1 );
- x264_zigzag_init( cpu_new, &zigzag_asm, 1 );
-
- ok = 1; used_asm = 0;
- TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
- TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
- TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
- TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
- report( "zigzag_field :" );
+ x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] );
+ x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] );
+ x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] );
ok = 1; used_asm = 0;
TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 );
report( "zigzag_interleave :" );
+
+ for( interlace = 0; interlace <= 1; interlace++ )
+ {
+ ok = 1; used_asm = 0;
+ TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
+ TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
+ TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+ TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
+ report( interlace ? "zigzag_field :" : "zigzag_frame :" );
+ }
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB
@@ -1247,8 +1255,8 @@
int *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
- uint16_t *inter = intra+100;
- uint16_t *qscale = inter+100;
+ uint16_t *inter = intra+128;
+ uint16_t *qscale = inter+128;
uint16_t *rnd = (uint16_t*)buf2;
x264_emms();
for( int j = 0; j < 100; j++ )
@@ -1268,6 +1276,44 @@
report( "mbtree propagate :" );
}
+ if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
+ {
+ set_func_name( "memcpy_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 16; size < 256; size += 16 )
+ {
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memcpy_aligned, buf3, buf1, size );
+ call_a( mc_a.memcpy_aligned, buf4, buf1, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memcpy aligned :" );
+ }
+
+ if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
+ {
+ set_func_name( "memzero_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 128; size < 1024; size += 128 )
+ {
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memzero_aligned, buf3, size );
+ call_a( mc_a.memzero_aligned, buf4, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memzero aligned :" );
+ }
+
return ret;
}
@@ -1280,9 +1326,9 @@
int alphas[36], betas[36];
int8_t tcs[36][4];
- x264_deblock_init( 0, &db_c );
- x264_deblock_init( cpu_ref, &db_ref );
- x264_deblock_init( cpu_new, &db_a );
+ x264_deblock_init( 0, &db_c, 0 );
+ x264_deblock_init( cpu_ref, &db_ref, 0 );
+ x264_deblock_init( cpu_new, &db_a, 0 );
/* not exactly the real values of a,b,tc but close enough */
for( int i = 35, a = 255, c = 250; i >= 0; i-- )
@@ -1337,7 +1383,8 @@
ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
- ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][4][4] );
+ ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] );
+ memset( bs, 99, sizeof(bs) );
for( int j = 0; j < X264_SCAN8_SIZE; j++ )
nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
for( int j = 0; j < 2; j++ )
@@ -1348,8 +1395,8 @@
mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
}
set_func_name( "deblock_strength" );
- call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
- call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
+ call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1), NULL );
+ call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1), NULL );
if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
{
ok = 0;
@@ -1384,7 +1431,7 @@
ALIGNED_16( dctcoef dct2[64] );
ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
- int oks[2] = {1,1}, used_asms[2] = {0,0};
+ int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
x264_t h_buf;
x264_t *h = &h_buf;
memset( h, 0, sizeof(*h) );
@@ -1558,6 +1605,41 @@
TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
+#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \
+ if( qf_a.optname != qf_ref.optname ) \
+ { \
+ set_func_name( #optname ); \
+ used_asms[2] = 1; \
+ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
+ { \
+ int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \
+ if( dmf > 32*64 ) \
+ continue; \
+ for( int i = 16; ; i <<= 1 )\
+ { \
+ int res_c, res_asm; \
+ int max = X264_MIN( i, PIXEL_MAX*16 ); \
+ for( int j = 0; j < w*w; j++ ) \
+ dct1[j] = rand()%(max*2+1) - max; \
+ call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \
+ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
+ res_c = call_c1( qf_c.optname, dct1, dmf ); \
+ res_asm = call_a1( qf_a.optname, dct2, dmf ); \
+ if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
+ { \
+ oks[2] = 0; \
+ fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
+ } \
+ call_c2( qf_c.optname, dct1, dmf ); \
+ call_a2( qf_a.optname, dct2, dmf ); \
+ if( i >= PIXEL_MAX*16 ) \
+ break; \
+ } \
+ } \
+ }
+
+ TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 );
+
x264_cqm_delete( h );
}
@@ -1567,6 +1649,9 @@
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
+ ok = oks[2]; used_asm = used_asms[2];
+ report( "optimize chroma dc :" );
+
ok = 1; used_asm = 0;
if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
@@ -1858,6 +1943,7 @@
int ret = 0, ok, used_asm = 1;
if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
return 0;
+ x264_cabac_init();
set_func_name( "cabac_encode_decision" );
memcpy( buf4, buf3, 0x1000 );
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/tools/test_x264.py
^
|
@@ -35,6 +35,7 @@
("", "--intra-refresh"),
("", "--no-cabac"),
("", "--interlaced"),
+ ("", "--slice-max-size 1000"),
("", "--frame-packing 5"),
[ "--preset %s" % p for p in ("ultrafast",
"superfast",
@@ -260,6 +261,7 @@
ffmpeg_proc = Popen([
"ffmpeg",
+ "-vsync 0",
"-i",
"%s.264" % self.fixture.dispatcher.video,
"ffmpeg-output.yuv"
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/x264.c
^
|
@@ -27,13 +27,9 @@
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
-#include <stdlib.h>
-#include <math.h>
-
#include <signal.h>
#define _GNU_SOURCE
#include <getopt.h>
-
#include "common/common.h"
#include "x264cli.h"
#include "input/input.h"
@@ -74,6 +70,8 @@
b_ctrl_c = 1;
}
+static char UNUSED originalCTitle[200] = "";
+
typedef struct {
int b_progress;
int i_seek;
@@ -124,7 +122,8 @@
static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-typedef struct{
+typedef struct
+{
int mod;
uint8_t pattern[24];
float fps_factor;
@@ -227,10 +226,12 @@
printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16, (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
#endif
printf( "built on " __DATE__ ", " );
-#ifdef __GNUC__
+#ifdef __INTEL_COMPILER
+ printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
+#elif defined(__GNUC__)
printf( "gcc: " __VERSION__ "\n" );
#else
- printf( "using a non-gcc compiler\n" );
+ printf( "using an unknown compiler\n" );
#endif
printf( "configuration: --bit-depth=%d\n", x264_bit_depth );
printf( "x264 license: " );
@@ -262,10 +263,15 @@
_setmode(_fileno(stdout), _O_BINARY);
#endif
+ GetConsoleTitle( originalCTitle, sizeof(originalCTitle) );
+
/* Parse command line */
if( parse( argc, argv, &param, &opt ) < 0 )
ret = -1;
+ /* Restore title; it can be changed by input modules */
+ SetConsoleTitle( originalCTitle );
+
/* Control-C handler */
signal( SIGINT, sigint_handler );
@@ -284,6 +290,8 @@
if( opt.qpfile )
fclose( opt.qpfile );
+ SetConsoleTitle( originalCTitle );
+
return ret;
}
@@ -323,11 +331,11 @@
printf( "\n" );
printf( " - valid csps for `lavf' demuxer:\n" );
printf( INDENT );
- int line_len = strlen( INDENT );
+ size_t line_len = strlen( INDENT );
for( enum PixelFormat i = PIX_FMT_NONE+1; i < PIX_FMT_NB; i++ )
{
const char *pfname = av_pix_fmt_descriptors[i].name;
- int name_len = strlen( pfname );
+ size_t name_len = strlen( pfname );
if( line_len + name_len > (80 - strlen( ", " )) )
{
printf( "\n" INDENT );
@@ -533,11 +541,7 @@
" - strict: Strictly hierarchical pyramid\n"
" - normal: Non-strict (not Blu-ray compatible)\n",
strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
- H1( " --open-gop <string> Use recovery points to close GOPs [none]\n"
- " - none: closed GOPs only\n"
- " - normal: standard open GOPs\n"
- " (not Blu-ray compatible)\n"
- " - bluray: Blu-ray-compatible open GOPs\n"
+ H1( " --open-gop Use recovery points to close GOPs\n"
" Only available with b-frames\n" );
H1( " --no-cabac Disable CABAC\n" );
H1( " -r, --ref <integer> Number of reference frames [%d]\n", defaults->i_frame_reference );
@@ -733,6 +737,7 @@
H0( " --seek <integer> First frame to encode\n" );
H0( " --frames <integer> Maximum number of frames to encode\n" );
H0( " --level <string> Specify level (as defined by Annex A)\n" );
+ H1( " --bluray-compat Enable compatibility hacks for Blu-ray support\n" );
H1( "\n" );
H1( " -v, --verbose Print stats for each frame\n" );
H1( " --no-progress Don't show the progress indicator while encoding\n" );
@@ -823,7 +828,8 @@
{ "no-b-adapt", no_argument, NULL, 0 },
{ "b-bias", required_argument, NULL, 0 },
{ "b-pyramid", required_argument, NULL, 0 },
- { "open-gop", required_argument, NULL, 0 },
+ { "open-gop", no_argument, NULL, 0 },
+ { "bluray-compat", no_argument, NULL, 0 },
{ "min-keyint", required_argument, NULL, 'i' },
{ "keyint", required_argument, NULL, 'I' },
{ "intra-refresh", no_argument, NULL, 0 },
@@ -1393,6 +1399,8 @@
info.tff = param->b_tff;
info.vfr = param->b_vfr_input;
+ input_opt.progress = opt->b_progress;
+
if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
return -1;
@@ -1488,11 +1496,15 @@
if( !b_user_interlaced && info.interlaced )
{
+#if HAVE_INTERLACED
x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, enabling %cff interlaced mode.\n"
" If you want otherwise, use --no-interlaced or --%cff\n",
info.tff ? 't' : 'b', info.tff ? 'b' : 't' );
param->b_interlaced = 1;
param->b_tff = !!info.tff;
+#else
+ x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, but not compiled with interlaced support\n" );
+#endif
}
/* Automatically reduce reference frame count to match the user's target level
@@ -1644,9 +1656,6 @@
double duration;
double pulldown_pts = 0;
int retval = 0;
- char UNUSED originalCTitle[200] = "";
-
- GetConsoleTitle( originalCTitle, sizeof(originalCTitle) );
opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
@@ -1805,7 +1814,5 @@
(double) i_file * 8 / ( 1000 * duration ) );
}
- SetConsoleTitle( originalCTitle );
-
return retval;
}
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/x264.h
^
|
@@ -41,7 +41,7 @@
#include "x264_config.h"
-#define X264_BUILD 114
+#define X264_BUILD 115
/* x264_t:
* opaque handler for encoder */
@@ -162,9 +162,6 @@
#define X264_B_PYRAMID_NORMAL 2
#define X264_KEYINT_MIN_AUTO 0
#define X264_KEYINT_MAX_INFINITE (1<<30)
-#define X264_OPEN_GOP_NONE 0
-#define X264_OPEN_GOP_NORMAL 1
-#define X264_OPEN_GOP_BLURAY 2
static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
@@ -176,7 +173,6 @@
static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 };
static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 };
static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
-static const char * const x264_open_gop_names[] = { "none", "normal", "bluray", 0 };
/* Colorspace type */
#define X264_CSP_MASK 0x00ff /* */
@@ -281,7 +277,8 @@
int i_bframe_adaptive;
int i_bframe_bias;
int i_bframe_pyramid; /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
- int i_open_gop; /* Open gop: 1=display order, 2=bluray compatibility braindamage mode */
+ int b_open_gop;
+ int b_bluray_compat;
int b_deblocking_filter;
int i_deblocking_filter_alphac0; /* [-6, 6] -6 light filter, 6 strong */
@@ -385,7 +382,8 @@
/* Cropping Rectangle parameters: added to those implicitly defined by
non-mod16 video resolutions. */
- struct {
+ struct
+ {
unsigned int i_left;
unsigned int i_top;
unsigned int i_right;
@@ -480,7 +478,8 @@
* H.264 level restriction information
****************************************************************************/
-typedef struct {
+typedef struct
+{
int level_idc;
int mbps; /* max macroblock processing rate (macroblocks/sec) */
int frame_size; /* max frame size (macroblocks) */
|
[-]
[+]
|
Changed |
x264-snapshot-20110622-2245.tar.bz2/x264cli.h
^
|
@@ -34,7 +34,7 @@
typedef void *hnd_t;
-static inline int64_t gcd( int64_t a, int64_t b )
+static inline uint64_t gcd( uint64_t a, uint64_t b )
{
while( 1 )
{
@@ -46,7 +46,7 @@
}
}
-static inline int64_t lcm( int64_t a, int64_t b )
+static inline uint64_t lcm( uint64_t a, uint64_t b )
{
return ( a / gcd( a, b ) ) * b;
}
|