Changes of Revision 10
Changed: x264.spec

#!BuildIgnore:  post-build-checks
%define binname  x264
%define realname libx264
-%define soname 65
-%define svn 20081218
-%define sonamecompa 64
-%define svncompa 20081001
-%define sonamecompb 60
-%define svncompb 20080829
-%define sonamecompc 57
-%define svncompc 20080126
+%define soname 67
+%define svn 20090627
+%define sonamecompa 66
+%define svncompa 20090228
+%define sonamecompb 65
+%define svncompb 20090119
+%define sonamecompc 64
+%define svncompc 20081001

Name:           %{binname}
-Summary:        A free h264/avc encoder - encoder binary
-Version:        0.0svn20081218
-Release:        5
+Summary:        A free h264/avc encoder - encoder binary.
+Version:        0.0svn%{svn}
+Release:        15
License:        GNU General Public License (GPL)
Group:          Productivity/Multimedia/Video
URL:            http://developers.videolan.org/x264.html

%package -n %{realname}-devel
-Summary:        Libraries and include file for the x264 encoder
+Summary:        Libraries and include file for the x264 encoder.
Group:          Development/Libraries/C and C++
Requires:       %{realname}-%{soname} = %{version}-%{release}
Requires:       %{buildrequires}

%clean
%__rm -rf "%{buildroot}"

-%post -n %{realname}-%{soname}
-/sbin/ldconfig
-
-%postun -n %{realname}-%{soname}
-/sbin/ldconfig
-
-%post -n %{realname}-%{sonamecompa}
-/sbin/ldconfig
-
-%postun -n %{realname}-%{sonamecompa}
-/sbin/ldconfig
-
-%post -n %{realname}-%{sonamecompb}
-/sbin/ldconfig
-
-%postun -n %{realname}-%{sonamecompb}
-/sbin/ldconfig
+%post -n %{realname}-%{soname} -p /sbin/ldconfig
+%postun -n %{realname}-%{soname} -p /sbin/ldconfig

-%post -n %{realname}-%{sonamecompc}
-/sbin/ldconfig
+%post -n %{realname}-%{sonamecompa} -p /sbin/ldconfig
+%postun -n %{realname}-%{sonamecompa} -p /sbin/ldconfig

-%postun -n %{realname}-%{sonamecompc}
-/sbin/ldconfig
+%post -n %{realname}-%{sonamecompb} -p /sbin/ldconfig
+%postun -n %{realname}-%{sonamecompb} -p /sbin/ldconfig

+%post -n %{realname}-%{sonamecompc} -p /sbin/ldconfig
+%postun -n %{realname}-%{sonamecompc} -p /sbin/ldconfig

%files
%defattr(-,root,root)
%attr(0755,root,root) %{_bindir}/x264

%files -n %{realname}-%{soname}
-%defattr(-,root,root)
+%defattr(0644,root,root)
%attr(0755,root,root) %{_libdir}/libx264.so.%{soname}

%files -n %{realname}-%{sonamecompa}
-%defattr(-,root,root)
+%defattr(0644,root,root)
%attr(0755,root,root) %{_libdir}/libx264.so.%{sonamecompa}

%files -n %{realname}-%{sonamecompb}
-%defattr(-,root,root)
+%defattr(0644,root,root)
%attr(0755,root,root) %{_libdir}/libx264.so.%{sonamecompb}

%files -n %{realname}-%{sonamecompc}
-%defattr(-,root,root)
+%defattr(0644,root,root)
%attr(0755,root,root) %{_libdir}/libx264.so.%{sonamecompc}

%files -n %{realname}-devel
-%defattr(-,root,root)
+%defattr(0644,root,root)
%{_libdir}/pkgconfig/x264.pc
%{_includedir}/x264.h
%{_libdir}/libx264.so
%{_libdir}/libx264.a

%changelog
-* Sun Sep 30 2007 Carsten Schoene <cs@linux-administrator.com>
-- import for SLE_10 build
+* Mon Jun 29 2009 Carsten Schoene <cs@linux-administrator.com> - 20090627-15
+- import for SLE_11 build
Deleted: x264-snapshot-20081218-2245.tar.bz2/.git/objects/pack/pack-af7017097b709ffa675014eec71010e10908193f.idx
Deleted: x264-snapshot-20081218-2245.tar.bz2/.git/objects/pack/pack-af7017097b709ffa675014eec71010e10908193f.pack
Changed: x264-snapshot-20090119-2245.tar.bz2/.git/index
Added: x264-snapshot-20090119-2245.tar.bz2/.git/objects/pack/pack-7e284d41e3d870f5e6bd5c2ad5b36d1e4f0910d9.idx
Added: x264-snapshot-20090119-2245.tar.bz2/.git/objects/pack/pack-7e284d41e3d870f5e6bd5c2ad5b36d1e4f0910d9.pack
Changed: x264-snapshot-20090119-2245.tar.bz2/.git/refs/heads/master
@@ -1 +1 @@
-5f8a1490eb0bc2a934c34bc8307bfdc1ade6a92d
+a48d1d0a2ad590d041b79bb152ed47d00451ba8d
Changed: x264-snapshot-20090119-2245.tar.bz2/.git/refs/heads/origin
@@ -1 +1 @@
-5f8a1490eb0bc2a934c34bc8307bfdc1ade6a92d
+a48d1d0a2ad590d041b79bb152ed47d00451ba8d
Changed: x264-snapshot-20090119-2245.tar.bz2/common/bs.h
@@ -50,10 +50,18 @@
     int i_bits_encoded; /* RD only */
 } bs_t;
 
-extern const vlc_t x264_coeff_token[5][17*4];
+typedef struct
+{
+    int     last;
+    int16_t level[16];
+    uint8_t run[16];
+} x264_run_level_t;
+
+extern const vlc_t x264_coeff0_token[5];
+extern const vlc_t x264_coeff_token[5][16*4];
 extern const vlc_t x264_total_zeros[15][16];
 extern const vlc_t x264_total_zeros_dc[3][4];
-extern const vlc_t x264_run_before[7][15];
+extern const vlc_t x264_run_before[7][16];
 
 /* A larger level table size theoretically could help a bit at extremely
  * high bitrates, but the cost in cache is usually too high for it to be
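Note: the new x264_run_level_t above packages a block's nonzero coefficients, scanned from the last one backwards, together with the run of zeros preceding each, which is the form the CAVLC writer consumes. A minimal scalar sketch of that extraction (it mirrors the level_run macro added to common/quant.c further down; the coeff_last helper here is a naive stand-in for x264's optimized coeff_last* functions, and it assumes at least one nonzero coefficient):

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    int     last;
    int16_t level[16];
    uint8_t run[16];
} run_level_t;

/* index of the last nonzero coefficient (naive stand-in helper) */
static int coeff_last( const int16_t *dct, int n )
{
    int i = n - 1;
    while( i > 0 && !dct[i] )
        i--;
    return i;
}

/* walk back from the last coefficient, recording each level and the run
 * of zeros in front of it; returns the number of nonzero levels */
static int coeff_level_run( const int16_t *dct, int n, run_level_t *rl )
{
    int i_last = rl->last = coeff_last( dct, n );
    int i_total = 0;
    do
    {
        int r = 0;
        rl->level[i_total] = dct[i_last];
        while( --i_last >= 0 && !dct[i_last] )
            r++;
        rl->run[i_total++] = r;
    } while( i_last >= 0 );
    return i_total;
}

int main(void)
{
    int16_t dct[16] = { 3, 0, 0, -2, 1 };  /* zigzag-scanned block */
    run_level_t rl;
    int total = coeff_level_run( dct, 16, &rl );
    printf( "last=%d:", rl.last );         /* prints: last=4: 1/0 -2/2 3/0 */
    for( int i = 0; i < total; i++ )
        printf( " %d/%d", rl.level[i], rl.run[i] );
    printf( "\n" );
    return 0;
}

For dct = {3, 0, 0, -2, 1} this prints "last=4: 1/0 -2/2 3/0": the trailing 1 has no zeros before it, the -2 is preceded by two zeros, and the leading 3 by none.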
Changed: x264-snapshot-20090119-2245.tar.bz2/common/cabac.c
@@ -742,41 +742,6 @@ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -static const uint8_t x264_cabac_probability[128] = -{ - FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781), - FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730), - FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667), - FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590), - FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495), - FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378), - FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234), - FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057), - FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838), - FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569), - FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237), - FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828), - FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325), - FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705), - FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941), - FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000), - FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276), - FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472), - FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818), - FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288), - FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857), - FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508), - FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224), - FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994), - FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807), - FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655), - FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532), - FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432), - FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350), - FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284), - FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231), - FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187) -}; /* -ln2(probability) */ #define F(a,b) {FIX8(a),FIX8(b)} const uint16_t x264_cabac_entropy[128][2] = | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/common.c
@@ -247,7 +247,7 @@
     if( b_error )
     {
         char *buf = strdup(value);
-        char *tok, *saveptr, *init;
+        char *tok, UNUSED *saveptr, *init;
         b_error = 0;
         p->cpu = 0;
         for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )
Changed: x264-snapshot-20090119-2245.tar.bz2/common/common.h
@@ -338,6 +338,7 @@
         int i_max_ref1;
         int i_delay;        /* Number of frames buffered for B reordering */
         int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
+        int b_have_sub8x8_esa;
     } frames;
 
     /* current frame being encoded */
@@ -604,6 +605,8 @@
     } stat;
 
+    void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+
     /* CPU functions dependents */
     x264_predict_t      predict_16x16[4+3];
     x264_predict_t      predict_8x8c[4+3];
Changed: x264-snapshot-20090119-2245.tar.bz2/common/cpu.c
@@ -33,6 +33,11 @@
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
+#ifdef SYS_OPENBSD
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
 
 #include "common.h"
 #include "cpu.h"
@@ -54,6 +59,7 @@
     {"Cache32",         X264_CPU_CACHELINE_32},
     {"Cache64",         X264_CPU_CACHELINE_64},
     {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
+    {"LZCNT",           X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
     {"", 0},
 };
@@ -117,6 +123,7 @@
     {
         cpu |= X264_CPU_SSE2_IS_FAST;
        	cpu |= X264_CPU_SSE_MISALIGN;
+        cpu |= X264_CPU_LZCNT;
         x264_cpu_mask_misalign_sse();
     }
     else
@@ -192,13 +199,17 @@
 
 #elif defined( ARCH_PPC )
 
-#ifdef SYS_MACOSX
+#if defined(SYS_MACOSX) || defined(SYS_OPENBSD)
 #include <sys/sysctl.h>
 uint32_t x264_cpu_detect( void )
 {
     /* Thank you VLC */
     uint32_t cpu = 0;
+#ifdef SYS_OPENBSD
+    int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+#else
     int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+#endif
     int has_altivec = 0;
     size_t length = sizeof( has_altivec );
     int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
@@ -292,10 +303,15 @@
     get_system_info( &info );
     return info.cpu_count;
 
-#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD)
+#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD) || defined(SYS_OPENBSD)
     int numberOfCPUs;
     size_t length = sizeof( numberOfCPUs );
+#ifdef SYS_OPENBSD
+    int mib[2] = { CTL_HW, HW_NCPU };
+    if( sysctl(mib, 2, &numberOfCPUs, &length, NULL, 0) )
+#else
     if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )
+#endif
     {
         numberOfCPUs = 1;
     }
Changed: x264-snapshot-20090119-2245.tar.bz2/common/cpu.h
@@ -35,7 +35,8 @@
  * This applies only to x86_32, since other architectures that need alignment
  * also have ABIs that ensure aligned stack. */
 #if defined(ARCH_X86) && defined(HAVE_MMX)
-void x264_stack_align( void (*func)(x264_t*), x264_t *arg );
+int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
+#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
 #else
 #define x264_stack_align(func,arg) func(arg)
 #endif
Changed: x264-snapshot-20090119-2245.tar.bz2/common/frame.c
@@ -99,7 +99,7 @@
         if( h->param.analyse.i_me_method >= X264_ME_ESA )
         {
             CHECKED_MALLOC( frame->buffer[3],
-                            2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
         }
Changed: x264-snapshot-20090119-2245.tar.bz2/common/macroblock.c
@@ -23,6 +23,7 @@ *****************************************************************************/ #include "common.h" +#include "encoder/me.h" void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] ) { @@ -140,8 +141,8 @@ int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8]; if( i_refa == -2 || i_refb == -2 || - ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) || - ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) ) + !( i_refa | *(uint32_t*)mv_a ) || + !( i_refb | *(uint32_t*)mv_b ) ) { *(uint32_t*)mv = 0; } @@ -730,17 +731,9 @@ } else /* B_*x* */ { - int b_list0[2]; - int b_list1[2]; + const uint8_t *b_list0 = x264_mb_type_list_table[h->mb.i_type][0]; + const uint8_t *b_list1 = x264_mb_type_list_table[h->mb.i_type][1]; - int i; - - /* init ref list utilisations */ - for( i = 0; i < 2; i++ ) - { - b_list0[i] = x264_mb_type_list0_table[h->mb.i_type][i]; - b_list1[i] = x264_mb_type_list1_table[h->mb.i_type][i]; - } if( h->mb.i_partition == D_16x16 ) { if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 ); @@ -846,6 +839,13 @@ h->mb.i_neighbour4[15] = h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT; + int buf_hpel = (h->param.i_width+48) * sizeof(int16_t); + int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int); + int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); + int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * + ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); + CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) ); + return 0; fail: return -1; } @@ -871,6 +871,7 @@ x264_free( h->mb.skipbp ); x264_free( h->mb.cbp ); x264_free( h->mb.qp ); + x264_free( h->scratch_buffer ); } void x264_macroblock_slice_init( x264_t *h ) { | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/macroblock.h
@@ -91,31 +91,18 @@ B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1, B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP }; -static const uint8_t x264_mb_type_list0_table[X264_MBTYPE_MAX][2] = +static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] = { - {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */ - {1,1}, /* P_L0 */ - {0,0}, /* P_8x8 */ - {1,1}, /* P_SKIP */ - {0,0}, /* B_DIRECT */ - {1,1}, {1,0}, {1,1}, /* B_L0_* */ - {0,1}, {0,0}, {0,1}, /* B_L1_* */ - {1,1}, {1,0}, {1,1}, /* B_BI_* */ - {0,0}, /* B_8x8 */ - {0,0} /* B_SKIP */ -}; -static const uint8_t x264_mb_type_list1_table[X264_MBTYPE_MAX][2] = -{ - {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */ - {0,0}, /* P_L0 */ - {0,0}, /* P_8x8 */ - {0,0}, /* P_SKIP */ - {0,0}, /* B_DIRECT */ - {0,0}, {0,1}, {0,1}, /* B_L0_* */ - {1,0}, {1,1}, {1,1}, /* B_L1_* */ - {1,0}, {1,1}, {1,1}, /* B_BI_* */ - {0,0}, /* B_8x8 */ - {0,0} /* B_SKIP */ + {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */ + {{1,1},{0,0}}, /* P_L0 */ + {{0,0},{0,0}}, /* P_8x8 */ + {{1,1},{0,0}}, /* P_SKIP */ + {{0,0},{0,0}}, /* B_DIRECT */ + {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}}, /* B_L0_* */ + {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}}, /* B_L1_* */ + {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}}, /* B_BI_* */ + {{0,0},{0,0}}, /* B_8x8 */ + {{0,0},{0,0}} /* B_SKIP */ }; #define IS_SUB4x4(type) ( (type ==D_L0_4x4)||(type ==D_L1_4x4)||(type ==D_BI_4x4)) | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/mc.c
@@ -132,9 +132,8 @@ #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d])) static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - int stride, int width, int height ) + int stride, int width, int height, int16_t *buf ) { - int16_t *buf = x264_malloc((width+5)*sizeof(int16_t)); int x, y; for( y=0; y<height; y++ ) { @@ -153,7 +152,6 @@ dstc += stride; src += stride; } - x264_free(buf); } static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; @@ -269,6 +267,42 @@ memset( dst, 0, n ); } +static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride ) +{ + int x, v = pix[0]+pix[1]+pix[2]+pix[3]; + for( x=0; x<stride-4; x++ ) + { + sum[x] = v + sum[x-stride]; + v += pix[x+4] - pix[x]; + } +} + +static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride ) +{ + int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7]; + for( x=0; x<stride-8; x++ ) + { + sum[x] = v + sum[x-stride]; + v += pix[x+8] - pix[x]; + } +} + +static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride ) +{ + int x; + for( x=0; x<stride-8; x++ ) + sum4[x] = sum8[x+4*stride] - sum8[x]; + for( x=0; x<stride-8; x++ ) + sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4]; +} + +static void integral_init8v( uint16_t *sum8, int stride ) +{ + int x; + for( x=0; x<stride-8; x++ ) + sum8[x] = sum8[x+8*stride] - sum8[x]; +} + void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame ) { uint8_t *src = frame->plane[0]; @@ -353,6 +387,11 @@ pf->memzero_aligned = memzero_aligned; pf->frame_init_lowres_core = frame_init_lowres_core; + pf->integral_init4h = integral_init4h; + pf->integral_init8h = integral_init8h; + pf->integral_init4v = integral_init4v; + pf->integral_init8v = integral_init8v; + #ifdef HAVE_MMX x264_mc_init_mmx( cpu, pf ); #endif @@ -370,7 +409,7 @@ int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8 int height = ((b_end ? 
frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8; int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd - int x, y; + int y; if( mb_y & b_interlaced ) return; @@ -382,7 +421,8 @@ frame->filtered[2] + offs, frame->filtered[3] + offs, frame->plane[0] + offs, - stride, width + 16, height - start ); + stride, width + 16, height - start, + h->scratch_buffer ); } /* generate integral image: @@ -398,23 +438,25 @@ start = -PADV; } if( b_end ) - height += PADV-8; + height += PADV-9; for( y = start; y < height; y++ ) { - uint8_t *ref = frame->plane[0] + y * stride - PADH; - uint16_t *line = frame->integral + (y+1) * stride - PADH + 1; - uint16_t v = line[0] = 0; - for( x = 1; x < stride-1; x++ ) - line[x] = v += ref[x] + line[x-stride] - line[x-stride-1]; - line -= 8*stride; - if( y >= 9-PADV ) + uint8_t *pix = frame->plane[0] + y * stride - PADH; + uint16_t *sum8 = frame->integral + (y+1) * stride - PADH; + uint16_t *sum4; + if( h->frames.b_have_sub8x8_esa ) + { + h->mc.integral_init4h( sum8, pix, stride ); + sum8 -= 8*stride; + sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2); + if( y >= 8-PADV ) + h->mc.integral_init4v( sum8, sum4, stride ); + } + else { - uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2); - for( x = 1; x < stride-8; x++, line++, sum4++ ) - { - sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0]; - line[0] += line[8+8*stride] - line[8] - line[8*stride]; - } + h->mc.integral_init8h( sum8, pix, stride ); + if( y >= 8-PADV ) + h->mc.integral_init8v( sum8-8*stride, stride ); } } } | ||
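Note: the rewritten integral-image code above splits the old per-pixel summed-area table into two cheaper passes: integral_init8h accumulates 8-wide horizontal sums down the rows (a vertical prefix sum), and integral_init8v differences rows 8 apart, so each entry ends up holding the sum of an 8x8 pixel block for the ESA/TESA candidate prefilter (the 4h/4v pair does the same at 4x4 granularity when b_have_sub8x8_esa is set, which is why the buffer in frame.c doubles in that case). A self-contained sketch of the same two-pass idea, with illustrative names rather than x264's:

#include <stdint.h>
#include <stdio.h>

enum { W = 16, H = 16 };

/* pass 1 (8h): sum[x] = previous row's prefix + horizontal sum of
 * pix[x..x+7], i.e. a vertical prefix sum of 8-wide horizontal sums */
static void init8h( uint16_t *sum, const uint8_t *pix, const uint16_t *prev )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( int x = 0; x + 8 <= W; x++ )
    {
        sum[x] = v + prev[x];
        if( x + 8 < W )          /* slide the 8-wide window right */
            v += pix[x+8] - pix[x];
    }
}

int main(void)
{
    uint8_t  pix[H][W];
    uint16_t sum[H+1][W] = {{0}};        /* row 0 is an all-zero sentinel */
    for( int y = 0; y < H; y++ )
        for( int x = 0; x < W; x++ )
            pix[y][x] = x + y;
    for( int y = 0; y < H; y++ )
        init8h( sum[y+1], pix[y], sum[y] );

    /* pass 2 (8v): the 8x8 block sum at (x,y) is simply the prefix sum
     * 8 rows below minus the prefix sum at y */
    int x = 2, y = 3;
    int blocksum = sum[y+8][x] - sum[y][x];
    int check = 0;
    for( int j = y; j < y+8; j++ )
        for( int i = x; i < x+8; i++ )
            check += pix[j][i];
    printf( "%d == %d\n", blocksum, check );  /* both print 768 */
    return 0;
}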
Changed: x264-snapshot-20090119-2245.tar.bz2/common/mc.h
@@ -55,7 +55,7 @@
                     uint8_t *src, int i_src, int w, int h);
 
     void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                         int i_stride, int i_width, int i_height );
+                         int i_stride, int i_width, int i_height, int16_t *buf );
 
     /* prefetch the next few macroblocks of fenc or fdec */
     void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
@@ -66,6 +66,12 @@
     void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
     void (*memzero_aligned)( void *dst, int n );
 
+    /* successive elimination prefilter */
+    void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+    void (*integral_init8v)( uint16_t *sum8, int stride );
+
     void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                     int src_stride, int dst_stride, int width, int height );
 } x264_mc_functions_t;
Changed: x264-snapshot-20090119-2245.tar.bz2/common/osdep.h
@@ -47,6 +47,7 @@
 #define fseek _fseeki64
 #define ftell _ftelli64
 #define isfinite _finite
+#define strtok_r strtok_s
 #define _CRT_SECURE_NO_DEPRECATE
 #define X264_VERSION "" // no configure script for msvc
 #endif
@@ -169,7 +170,7 @@
 }
 #endif
 
-#ifdef __GNUC__
+#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
 #define x264_clz(x) __builtin_clz(x)
 #else
 static int ALWAYS_INLINE x264_clz( uint32_t x )
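Note: the tightened condition above matters because __builtin_clz only appeared in GCC 3.4, so older compilers have to take the scalar x264_clz fallback in the #else branch. A generic branch-halving fallback with the same contract (undefined at x == 0) would look like this; a sketch, not x264's exact implementation:

#include <stdint.h>

/* count leading zeros of a nonzero 32-bit value by halving the search
 * window; matches __builtin_clz(x) for x != 0 (both undefined at 0) */
static inline int clz32( uint32_t x )
{
    int n = 0;
    if( !(x & 0xffff0000) ) { n += 16; x <<= 16; }
    if( !(x & 0xff000000) ) { n +=  8; x <<=  8; }
    if( !(x & 0xf0000000) ) { n +=  4; x <<=  4; }
    if( !(x & 0xc0000000) ) { n +=  2; x <<=  2; }
    n += !(x & 0x80000000);
    return n;
}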
Changed: x264-snapshot-20090119-2245.tar.bz2/common/pixel.c
@@ -140,7 +140,7 @@
  * pixel_var_wxh
  ****************************************************************************/
 #define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
 { \
     uint32_t var = 0, sum = 0, sqr = 0; \
     int x, y; \
@@ -154,7 +154,6 @@
         pix += i_stride; \
     } \
     var = sqr - (sum * sum >> shift); \
-    *sad = sum; \
     return var; \
 }
 
@@ -489,12 +488,12 @@
 float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                            uint8_t *pix1, int stride1,
                            uint8_t *pix2, int stride2,
-                           int width, int height )
+                           int width, int height, void *buf )
 {
     int x, y, z;
     float ssim = 0.0;
-    int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
-    int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
+    int (*sum0)[4] = buf;
+    int (*sum1)[4] = sum0 + width/4+3;
     width >>= 2;
     height >>= 2;
     z = 0;
@@ -509,8 +508,6 @@
         for( x = 0; x < width-1; x += 4 )
             ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
     }
-    x264_free(sum0);
-    x264_free(sum1);
     return ssim;
 }
Changed: x264-snapshot-20090119-2245.tar.bz2/common/pixel.h
@@ -74,7 +74,7 @@
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
 
-    int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+    int (*var[4])( uint8_t *pix, int stride );
     uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
@@ -104,6 +104,6 @@
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
 
 #endif
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/dct.c
@@ -21,10 +21,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  *****************************************************************************/
 
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
 #include "common/common.h"
 #include "ppccommon.h"
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/deblock.c
@@ -18,10 +18,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  *****************************************************************************/
 
-#if defined SYS_LINUX
-#include <altivec.h>
-#endif
-
 #include "common/common.h"
 #include "ppccommon.h"
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/mc.c
@@ -27,10 +27,6 @@
 #include <stdint.h>
 #include <stdarg.h>
 
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
 #include "x264.h"
 #include "common/common.h"
 #include "common/mc.h"
@@ -545,7 +541,7 @@
 }
 
 void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                               int i_stride, int i_width, int i_height )
+                               int i_stride, int i_width, int i_height, int16_t *buf )
 {
     int x, y;
 
@@ -563,7 +559,7 @@
     vec_u16_t twov, fourv, fivev, sixv;
     vec_s16_t sixteenv, thirtytwov;
-    vect_ushort_u temp_u;
+    vec_u16_u temp_u;
 
     temp_u.s[0]=2;
     twov = vec_splat( temp_u.v, 0 );
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/pixel.c
@@ -21,10 +21,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  *****************************************************************************/
 
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
 #include "common/common.h"
 #include "ppccommon.h"
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/ppccommon.h
@@ -18,6 +18,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  *****************************************************************************/
 
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
 /***********************************************************************
  * For constant vectors, use parentheses on OS X and braces on Linux
  **********************************************************************/
@@ -38,19 +42,24 @@
 #define vec_s32_t vector signed int
 
 typedef union {
-  unsigned int s[4];
-  vector unsigned int v;
-} vect_int_u;
+  uint32_t s[4];
+  vec_u32_t v;
+} vec_u32_u;
+
+typedef union {
+  uint16_t s[8];
+  vec_u16_t v;
+} vec_u16_u;
 
 typedef union {
-  unsigned short s[8];
-  vector unsigned short v;
-} vect_ushort_u;
+  int16_t s[8];
+  vec_s16_t v;
+} vec_s16_u;
 
 typedef union {
-  signed short s[8];
-  vector signed short v;
-} vect_sshort_u;
+  uint8_t s[16];
+  vec_u8_t v;
+} vec_u8_u;
 
 /***********************************************************************
  * Null vector
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/predict.c
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: h264 encoder ***************************************************************************** - * Copyright (C) 2007-2008 Guillaume Poirier <gpoirier@mplayerhq.hu> + * Copyright (C) 2007-2009 Guillaume Poirier <gpoirier@mplayerhq.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,15 +18,65 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ -#ifdef SYS_LINUX -#include <altivec.h> -#endif - #include "common/common.h" #include "predict.h" #include "pixel.h" #include "ppccommon.h" +static void predict_8x8c_p_altivec( uint8_t *src ) +{ + int i; + int a, b, c; + int H = 0; + int V = 0; + int i00; + + for( i = 0; i < 4; i++ ) + { + H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] ); + V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] ); + } + + a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); + b = ( 17 * H + 16 ) >> 5; + c = ( 17 * V + 16 ) >> 5; + i00 = a -3*b -3*c + 16; + + vec_s16_u i00_u, b_u, c_u; + i00_u.s[0] = i00; + b_u.s[0] = b; + c_u.s[0] = c; + + vec_u16_t val5_v = vec_splat_u16(5); + vec_s16_t i00_v, b_v, c_v; + i00_v = vec_splat(i00_u.v, 0); + b_v = vec_splat(b_u.v, 0); + c_v = vec_splat(c_u.v, 0); + + vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7); + vec_s32_t mule_b_v = vec_mule(induc_v, b_v); + vec_s32_t mulo_b_v = vec_mulo(induc_v, b_v); + vec_s16_t mul_b_induc0_v = vec_pack(vec_mergeh(mule_b_v, mulo_b_v), vec_mergel(mule_b_v, mulo_b_v)); + vec_s16_t add_i0_b_0v = vec_adds(i00_v, mul_b_induc0_v); + + PREP_STORE8; + + for( i = 0; i < 8; ++i ) + { + vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v); + vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v); + VEC_STORE8(com_sat_v, &src[0]); + src += FDEC_STRIDE; + add_i0_b_0v = vec_adds(add_i0_b_0v, c_v); + + } +} + + +/**************************************************************************** + * 16x16 prediction for intra luma block + ****************************************************************************/ + static void predict_16x16_p_altivec( uint8_t *src ) { int16_t a, b, c, i; @@ -45,7 +95,7 @@ c = ( 5 * V + 32 ) >> 6; i00 = a - b * 7 - c * 7 + 16; - vect_sshort_u i00_u, b_u, c_u; + vec_s16_u i00_u, b_u, c_u; i00_u.s[0] = i00; b_u.s[0] = b; c_u.s[0] = c; @@ -72,16 +122,122 @@ vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v); vec_st( com_sat_v, 0, &src[0]); src += FDEC_STRIDE; - i00 += c; add_i0_b_0v = vec_adds(add_i0_b_0v, c_v); add_i0_b_8v = vec_adds(add_i0_b_8v, c_v); } } +#define PREDICT_16x16_DC_ALTIVEC(v) \ +for (i=0; i<16; i+=2) \ +{ \ + vec_st(v, 0, src); \ + vec_st(v, FDEC_STRIDE, src); \ + src += FDEC_STRIDE*2; \ +} + +static void predict_16x16_dc_altivec( uint8_t *src ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * FDEC_STRIDE]; + dc += src[i - FDEC_STRIDE]; + } + vec_u8_u v ; v.s[0] = (( dc + 16 ) >> 5); + vec_u8_t bc_v = vec_splat(v.v, 0); + + PREDICT_16x16_DC_ALTIVEC(bc_v); +} + +static void predict_16x16_dc_left_altivec( uint8_t *src ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * FDEC_STRIDE]; + } + vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4); + vec_u8_t bc_v = vec_splat(v.v, 0); + + PREDICT_16x16_DC_ALTIVEC(bc_v); +} + +static void 
predict_16x16_dc_top_altivec( uint8_t *src ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[i - FDEC_STRIDE]; + } + vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4); + vec_u8_t bc_v = vec_splat(v.v, 0); + + PREDICT_16x16_DC_ALTIVEC(bc_v); +} + +static void predict_16x16_dc_128_altivec( uint8_t *src ) +{ + int i; + /* test if generating the constant is faster than loading it. + vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080, 0x80808080, 0x80808080); + */ + vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1),(vec_u8_t)vec_splat_u8(7)); + PREDICT_16x16_DC_ALTIVEC(bc_v); +} + +static void predict_16x16_h_altivec( uint8_t *src ) +{ + int i; + + for( i = 0; i < 16; i++ ) + { + vec_u8_t v = vec_ld(-1, src); + vec_u8_t v_v = vec_splat(v, 15); + vec_st(v_v, 0, src); + + src += FDEC_STRIDE; + } +} + +static void predict_16x16_v_altivec( uint8_t *src ) +{ + vec_u32_u v; + v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE]; + v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE]; + v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE]; + v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE]; + + int i; + + for( i = 0; i < 16; i++ ) + { + vec_st(v.v, 0, (uint32_t*)src); + src += FDEC_STRIDE; + } +} + + /**************************************************************************** * Exported functions: ****************************************************************************/ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] ) { - pf[I_PRED_16x16_P] = predict_16x16_p_altivec; + pf[I_PRED_16x16_V ] = predict_16x16_v_altivec; + pf[I_PRED_16x16_H ] = predict_16x16_h_altivec; + pf[I_PRED_16x16_DC] = predict_16x16_dc_altivec; + pf[I_PRED_16x16_P ] = predict_16x16_p_altivec; + pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec; + pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec; + pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec; +} + +void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] ) +{ + pf[I_PRED_CHROMA_P] = predict_8x8c_p_altivec; } | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/predict.h
@@ -22,5 +22,6 @@
 #define X264_PPC_PREDICT_H
 
 void x264_predict_16x16_init_altivec ( x264_predict_t pf[7] );
+void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );
 
 #endif /* X264_PPC_PREDICT_H */
Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/quant.c
@@ -18,10 +18,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ -#if defined SYS_LINUX -#include <altivec.h> -#endif - #include "common/common.h" #include "ppccommon.h" #include "quant.h" @@ -75,7 +71,7 @@ vec_s16_t temp1v, temp2v; - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); @@ -129,15 +125,15 @@ vec_u16_t mfv; vec_u16_t biasv; - vect_ushort_u mf_u; + vec_u16_u mf_u; mf_u.s[0]=mf; mfv = vec_splat( mf_u.v, 0 ); - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - vect_ushort_u bias_u; + vec_u16_u bias_u; bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); @@ -177,15 +173,15 @@ vec_u16_t mfv; vec_u16_t biasv; - vect_ushort_u mf_u; + vec_u16_u mf_u; mf_u.s[0]=mf; mfv = vec_splat( mf_u.v, 0 ); - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - vect_ushort_u bias_u; + vec_u16_u bias_u; bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); @@ -213,7 +209,7 @@ vec_s16_t temp1v, temp2v; - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); @@ -282,7 +278,7 @@ if( i_qbits >= 0 ) { vec_u16_t i_qbitsv; - vect_ushort_u qbits_u; + vec_u16_u qbits_u; qbits_u.s[0]=i_qbits; i_qbitsv = vec_splat(qbits_u.v, 0); @@ -294,17 +290,17 @@ const int f = 1 << (-i_qbits-1); vec_s32_t fv; - vect_int_u f_u; + vec_u32_u f_u; f_u.s[0]=f; fv = (vec_s32_t)vec_splat(f_u.v, 0); vec_u32_t i_qbitsv; - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=-i_qbits; i_qbitsv = vec_splat(qbits_u.v, 0); vec_u32_t sixteenv; - vect_int_u sixteen_u; + vec_u32_u sixteen_u; sixteen_u.s[0]=16; sixteenv = vec_splat(sixteen_u.v, 0); @@ -329,7 +325,7 @@ if( i_qbits >= 0 ) { vec_u16_t i_qbitsv; - vect_ushort_u qbits_u; + vec_u16_u qbits_u; qbits_u.s[0]=i_qbits; i_qbitsv = vec_splat(qbits_u.v, 0); @@ -341,17 +337,17 @@ const int f = 1 << (-i_qbits-1); vec_s32_t fv; - vect_int_u f_u; + vec_u32_u f_u; f_u.s[0]=f; fv = (vec_s32_t)vec_splat(f_u.v, 0); vec_u32_t i_qbitsv; - vect_int_u qbits_u; + vec_u32_u qbits_u; qbits_u.s[0]=-i_qbits; i_qbitsv = vec_splat(qbits_u.v, 0); vec_u32_t sixteenv; - vect_int_u sixteen_u; + vec_u32_u sixteen_u; sixteen_u.s[0]=16; sixteenv = vec_splat(sixteen_u.v, 0); | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/predict.c
@@ -786,6 +786,13 @@
 #ifdef HAVE_MMX
     x264_predict_8x8c_init_mmx( cpu, pf );
 #endif
+
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        x264_predict_8x8c_init_altivec( pf );
+    }
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
Changed: x264-snapshot-20090119-2245.tar.bz2/common/quant.c
@@ -273,6 +273,27 @@ return x264_coeff_last_internal( l, 64 ); } +#define level_run(num)\ +static int x264_coeff_level_run##num( int16_t *dct, x264_run_level_t *runlevel )\ +{\ + int i_last = runlevel->last = x264_coeff_last##num(dct);\ + int i_total = 0;\ + do\ + {\ + int r = 0;\ + runlevel->level[i_total] = dct[i_last];\ + while( --i_last >= 0 && dct[i_last] == 0 )\ + r++;\ + runlevel->run[i_total++] = r;\ + } while( i_last >= 0 );\ + return i_total;\ +} + +level_run(4) +level_run(15) +level_run(16) + + void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -293,6 +314,9 @@ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64; + pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15; + pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16; #ifdef HAVE_MMX if( cpu&X264_CPU_MMX ) @@ -323,8 +347,16 @@ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext; + pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext; #endif pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext; + pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt; + pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt; + } } if( cpu&X264_CPU_SSE2 ) @@ -347,6 +379,16 @@ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; + } } if( cpu&X264_CPU_SSSE3 ) @@ -375,4 +417,6 @@ #endif pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4]; pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC]; + pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4]; + pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC]; } | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/quant.h
@@ -40,6 +40,7 @@
     int (*decimate_score16)( int16_t *dct );
     int (*decimate_score64)( int16_t *dct );
     int (*coeff_last[6])( int16_t *dct );
+    int (*coeff_level_run[5])( int16_t *dct, x264_run_level_t *runlevel );
 } x264_quant_function_t;
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
Changed: x264-snapshot-20090119-2245.tar.bz2/common/vlc.c
@@ -21,15 +21,19 @@ #include "common.h" #define MKVLC( a, b ) { a, b } -const vlc_t x264_coeff_token[5][17*4] = +const vlc_t x264_coeff0_token[5] = +{ + MKVLC( 0x1, 1 ), /* str=1 */ + MKVLC( 0x3, 2 ), /* str=11 */ + MKVLC( 0xf, 4 ), /* str=1111 */ + MKVLC( 0x3, 6 ), /* str=000011 */ + MKVLC( 0x1, 2 ) /* str=01 */ +}; + +const vlc_t x264_coeff_token[5][16*4] = { /* table 0 */ { - MKVLC( 0x1, 1 ), /* str=1 */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x5, 6 ), /* str=000101 */ MKVLC( 0x1, 2 ), /* str=01 */ MKVLC( 0x0, 0 ), /* str= */ @@ -113,11 +117,6 @@ /* table 1 */ { - MKVLC( 0x3, 2 ), /* str=11 */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0xb, 6 ), /* str=001011 */ MKVLC( 0x2, 2 ), /* str=10 */ MKVLC( 0x0, 0 ), /* str= */ @@ -200,11 +199,6 @@ }, /* table 2 */ { - MKVLC( 0xf, 4 ), /* str=1111 */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0xf, 6 ), /* str=001111 */ MKVLC( 0xe, 4 ), /* str=1110 */ MKVLC( 0x0, 0 ), /* str= */ @@ -288,11 +282,6 @@ /* table 3 */ { - MKVLC( 0x3, 6 ), /* str=000011 */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 6 ), /* str=000000 */ MKVLC( 0x1, 6 ), /* str=000001 */ MKVLC( 0x0, 0 ), /* str= */ @@ -376,11 +365,6 @@ /* table 4 */ { - MKVLC( 0x1, 2 ), /* str=01 */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x0, 0 ), /* str= */ - MKVLC( 0x7, 6 ), /* str=000111 */ MKVLC( 0x1, 1 ), /* str=1 */ MKVLC( 0x0, 0 ), /* str= */ @@ -762,7 +746,7 @@ }; /* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */ -const vlc_t x264_run_before[7][15] = +const vlc_t x264_run_before[7][16] = { { /* i_zero_left 1 */ MKVLC( 0x1, 1 ), /* str=1 */ | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/cabac-a.asm
@@ -31,21 +31,12 @@
 cextern x264_cabac_transition
 cextern x264_cabac_renorm_shift
 
-%macro DEF_TMP 16
-    %rep 8
-        %define t%1d r%9d
-        %define t%1b r%9b
-        %define t%1  r%9
-        %rotate 1
-    %endrep
-%endmacro
-
 ; t3 must be ecx, since it's used for shift.
 %ifdef ARCH_X86_64
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,10
     %define pointer resq
 %else
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
+    DECLARE_REG_TMP 0,3,2,1,4,5,6,3
     %define pointer resd
 %endif
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-a.asm
@@ -41,27 +41,13 @@
 ; implicit bipred only:
 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
 %ifdef ARCH_X86_64
-    %define t0 r0
-    %define t1 r1
-    %define t2 r2
-    %define t3 r3
-    %define t4 r4
-    %define t5 r5
-    %define t6d r10d
-    %define t7d r11d
+    DECLARE_REG_TMP 0,1,2,3,4,5,10,11
     %macro AVG_START 0
         PROLOGUE 6,7
 .height_loop:
     %endmacro
 %else
-    %define t0 r1
-    %define t1 r2
-    %define t2 r3
-    %define t3 r4
-    %define t4 r5
-    %define t5 r6
-    %define t6d r1d
-    %define t7d r2d
+    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
     %macro AVG_START 0
         PROLOGUE 0,7
         mov t0, r0m
@@ -690,12 +676,11 @@
 ; chroma MC
 ;=============================================================================
 
-    %define t0d eax
-    %define t0  rax
+    %define t0 rax
 %ifdef ARCH_X86_64
-    %define t1d r10d
+    %define t1 r10
 %else
-    %define t1d r1d
+    %define t1 r1
 %endif
 
 %macro MC_CHROMA_START 0
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-a2.asm
@@ -694,6 +694,104 @@ +;----------------------------------------------------------------------------- +; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride ) +;----------------------------------------------------------------------------- +cglobal x264_integral_init4h_sse4, 3,4 + lea r3, [r0+r2*2] + add r1, r2 + neg r2 + pxor m4, m4 +.loop: + movdqa m0, [r1+r2] + movdqu m1, [r1+r2+8] + mpsadbw m0, m4, 0 + mpsadbw m1, m4, 0 + paddw m0, [r0+r2*2] + paddw m1, [r0+r2*2+16] + movdqa [r3+r2*2 ], m0 + movdqa [r3+r2*2+16], m1 + add r2, 16 + jl .loop + REP_RET + +cglobal x264_integral_init8h_sse4, 3,4 + lea r3, [r0+r2*2] + add r1, r2 + neg r2 + pxor m4, m4 +.loop: + movdqa m0, [r1+r2] + movdqu m1, [r1+r2+8] + movdqa m2, m0 + movdqa m3, m1 + mpsadbw m0, m4, 0 + mpsadbw m1, m4, 0 + mpsadbw m2, m4, 4 + mpsadbw m3, m4, 4 + paddw m0, [r0+r2*2] + paddw m1, [r0+r2*2+16] + paddw m0, m2 + paddw m1, m3 + movdqa [r3+r2*2 ], m0 + movdqa [r3+r2*2+16], m1 + add r2, 16 + jl .loop + REP_RET + +%macro INTEGRAL_INIT 1 +;----------------------------------------------------------------------------- +; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride ) +;----------------------------------------------------------------------------- +cglobal x264_integral_init4v_%1, 3,5 + shl r2, 1 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + movu m0, [r0+r2+8] + mova m2, [r0+r2] + movu m1, [r4+r2+8] + paddw m0, m2 + paddw m1, [r4+r2] + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, mmsize + jl .loop + REP_RET + +;----------------------------------------------------------------------------- +; void x264_integral_init8v_mmx( uint16_t *sum8, int stride ) +;----------------------------------------------------------------------------- +cglobal x264_integral_init8v_%1, 3,3 + shl r1, 1 + add r0, r1 + lea r2, [r0+r1*8] + neg r1 +.loop: + mova m0, [r2+r1] + mova m1, [r2+r1+mmsize] + psubw m0, [r0+r1] + psubw m1, [r0+r1+mmsize] + mova [r0+r1], m0 + mova [r0+r1+mmsize], m1 + add r1, 2*mmsize + jl .loop + REP_RET +%endmacro + +INIT_MMX +INTEGRAL_INIT mmx +INIT_XMM +INTEGRAL_INIT sse2 + + + %macro FILT8x4 7 mova %3, [r0+%7] mova %4, [r0+r5+%7] | ||
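Note: x264_integral_init4h_sse4 above leans on a trick: mpsadbw normally computes eight SADs of overlapping 4-byte groups against one selected 4-byte block, but with an all-zero second operand each |p - 0| collapses to p itself, so a single instruction yields eight overlapping 4-byte sums; the 8h variant issues it twice (block offsets 0 and 4) and adds the halves to get 8-byte sums. A scalar C model of just that zero-operand special case (function and variable names are mine):

#include <stdint.h>
#include <stdio.h>

/* scalar model of "mpsadbw dst, zero, imm8": with an all-zero second
 * operand, |src - 0| == src, so result word k is the plain sum of the
 * 4 consecutive source bytes starting at k + 4*blk */
static void mpsadbw_zero( uint16_t dst[8], const uint8_t *src, int blk )
{
    for( int k = 0; k < 8; k++ )
        dst[k] = src[k+4*blk] + src[k+4*blk+1] + src[k+4*blk+2] + src[k+4*blk+3];
}

int main(void)
{
    uint8_t pix[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
    uint16_t lo[8], hi[8];
    mpsadbw_zero( lo, pix, 0 );   /* eight 4-wide sums of pix[k..k+3]   */
    mpsadbw_zero( hi, pix, 1 );   /* eight 4-wide sums of pix[k+4..k+7] */
    for( int k = 0; k < 8; k++ )  /* lo[k]+hi[k]: 8-wide sums, as in init8h */
        printf( "sum4[%d]=%2d  sum8[%d]=%2d\n", k, lo[k], k, lo[k]+hi[k] );
    return 0;
}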
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-c.c
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); extern void x264_memzero_aligned_mmx( void * dst, int n ); extern void x264_memzero_aligned_sse2( void * dst, int n ); +extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride ); +extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride ); +extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride ); +extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride ); +extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride ); +extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); #define LOWRES(cpu) \ extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ int src_stride, int dst_stride, int width, int height ); @@ -196,16 +202,14 @@ void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\ void x264_sfence( void );\ static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\ - int stride, int width, int height )\ + int stride, int width, int height, int16_t *buf )\ {\ - int16_t *buf;\ int realign = (long)src & (align-1);\ src -= realign;\ dstv -= realign;\ dstc -= realign;\ dsth -= realign;\ width += realign;\ - buf = x264_malloc((width+16)*sizeof(int16_t));\ while( height-- )\ {\ x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\ @@ -217,14 +221,13 @@ src += stride;\ }\ x264_sfence();\ - x264_free(buf);\ } HPEL(8, mmxext, mmxext, mmxext, mmxext) HPEL(16, sse2_amd, mmxext, mmxext, sse2) #ifdef ARCH_X86_64 -void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height ); -void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height ); +void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf ); +void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf ); #else HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, sse2, ssse3, ssse3) @@ -242,6 +245,8 @@ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; pf->memcpy_aligned = x264_memcpy_aligned_mmx; pf->memzero_aligned = x264_memzero_aligned_mmx; + pf->integral_init4v = x264_integral_init4v_mmx; + pf->integral_init8v = x264_integral_init8v_mmx; if( !(cpu&X264_CPU_MMXEXT) ) return; @@ -286,6 +291,8 @@ pf->memcpy_aligned = x264_memcpy_aligned_sse2; pf->memzero_aligned = x264_memzero_aligned_sse2; + pf->integral_init4v = x264_integral_init4v_sse2; + pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; if( cpu&X264_CPU_SSE2_IS_SLOW ) @@ -331,4 +338,10 @@ pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->mc_chroma = x264_mc_chroma_ssse3; + + if( !(cpu&X264_CPU_SSE4) ) + return; + + pf->integral_init4h = x264_integral_init4h_sse4; + pf->integral_init8h = x264_integral_init8h_sse4; } | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/pixel-a.asm
@@ -230,20 +230,15 @@ pxor m6, m6 ; sum squared pxor m7, m7 ; zero %ifdef ARCH_X86_64 - %define t3d r3d + %define t3 r3 %else - %define t3d r2d + %define t3 r2 %endif %endmacro %macro VAR_END 1 -%if mmsize == 16 - movhlps m0, m5 - paddw m5, m0 -%endif - movifnidn r2d, r2m + HADDW m5, m7 movd r1d, m5 - movd [r2], m5 ; return sum imul r1d, r1d HADDD m6, m1 shr r1d, %1 @@ -258,27 +253,25 @@ mova m0, [r0] mova m1, m0 mova m3, [r0+%1] - mova m2, m0 - punpcklbw m0, m7 mova m4, m3 + punpcklbw m0, m7 punpckhbw m1, m7 %ifidn %1, r1 lea r0, [r0+%1*2] %else add r0, r1 %endif - punpckhbw m4, m7 - psadbw m2, m7 - paddw m5, m2 - mova m2, m3 punpcklbw m3, m7 + punpckhbw m4, m7 + paddw m5, m0 dec t3d - psadbw m2, m7 pmaddwd m0, m0 - paddw m5, m2 + paddw m5, m1 pmaddwd m1, m1 + paddw m5, m3 paddd m6, m0 pmaddwd m3, m3 + paddw m5, m4 paddd m6, m1 pmaddwd m4, m4 paddd m6, m3 @@ -287,7 +280,7 @@ %endmacro ;----------------------------------------------------------------------------- -; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * ) +; int x264_pixel_var_wxh_mmxext( uint8_t *, int ) ;----------------------------------------------------------------------------- INIT_MMX cglobal x264_pixel_var_16x16_mmxext, 2,3 @@ -315,13 +308,12 @@ lea r0, [r0+r1*2] mova m1, m0 punpcklbw m0, m7 - mova m2, m1 punpckhbw m1, m7 dec t3d + paddw m5, m0 + paddw m5, m1 pmaddwd m0, m0 pmaddwd m1, m1 - psadbw m2, m7 - paddw m5, m2 paddd m6, m0 paddd m6, m1 jnz .loop @@ -1036,15 +1028,13 @@ ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 - %define t0 r10 - %define t0d r10d + %define t0 r10 %else ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned SUB esp, 16 %define top_1d esp+8 %define left_1d esp - %define t0 r2 - %define t0d r2d + %define t0 r2 %endif call load_hadamard @@ -1076,17 +1066,11 @@ RET %ifdef ARCH_X86_64 - %define t0 r10 - %define t0d r10d - %define t2 r11 - %define t2w r11w - %define t2d r11d + %define t0 r10 + %define t2 r11 %else - %define t0 r0 - %define t0d r0d - %define t2 r2 - %define t2w r2w - %define t2d r2d + %define t0 r0 + %define t2 r2 %endif ;----------------------------------------------------------------------------- @@ -1739,10 +1723,10 @@ %macro ADS_START 1 ; unroll_size %ifdef ARCH_X86_64 - %define t0 r6 + %define t0 r6 mov r10, rsp %else - %define t0 r4 + %define t0 r4 mov rbp, rsp %endif mov r0d, r5m | ||
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/pixel.h
@@ -67,8 +67,8 @@
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );
 
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
-DECL_PIXELS( int, var, sse2,   ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( int, var, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, ssse3,  ( uint8_t *pix, int i_stride ))
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/quant-a.asm
@@ -241,19 +241,9 @@ %endmacro %ifdef ARCH_X86_64 - %define t0 r4 - %define t0d r4d - %define t1 r3 - %define t1d r3d - %define t2 r2 - %define t2d r2d -%else - %define t0 r2 - %define t0d r2d - %define t1 r0 - %define t1d r0d - %define t2 r1 - %define t2d r1d + DECLARE_REG_TMP 4,3,2 +%else + DECLARE_REG_TMP 2,0,1 %endif %macro DEQUANT_START 2 @@ -672,9 +662,12 @@ DECIMATE8x8 sse2 DECIMATE8x8 ssse3 +;----------------------------------------------------------------------------- +; int x264_coeff_last( int16_t *dct ) +;----------------------------------------------------------------------------- + %macro LAST_MASK_SSE2 2-3 movdqa xmm0, [%2+ 0] - pxor xmm2, xmm2 packsswb xmm0, [%2+16] pcmpeqb xmm0, xmm2 pmovmskb %1, xmm0 @@ -683,7 +676,6 @@ %macro LAST_MASK_MMX 3 movq mm0, [%2+ 0] movq mm1, [%2+16] - pxor mm2, mm2 packsswb mm0, [%2+ 8] packsswb mm1, [%2+24] pcmpeqb mm0, mm2 @@ -694,45 +686,60 @@ or %1, %3 %endmacro +%macro LAST_X86 3 + bsr %1, %2 +%endmacro + +%macro LAST_SSE4A 3 + lzcnt %1, %2 + xor %1, %3 +%endmacro + +%macro COEFF_LAST4 1 %ifdef ARCH_X86_64 -cglobal x264_coeff_last4_mmxext, 1,1 - bsr rax, [r0] +cglobal x264_coeff_last4_%1, 1,1 + LAST rax, [r0], 0x3f shr eax, 4 RET %else -cglobal x264_coeff_last4_mmxext, 0,3 +cglobal x264_coeff_last4_%1, 0,3 mov edx, r0m mov eax, [edx+4] xor ecx, ecx test eax, eax cmovz eax, [edx] setnz cl - bsr eax, eax + LAST eax, eax, 0x1f shr eax, 4 lea eax, [eax+ecx*2] RET %endif +%endmacro + +%define LAST LAST_X86 +COEFF_LAST4 mmxext +%define LAST LAST_SSE4A +COEFF_LAST4 mmxext_lzcnt %macro COEFF_LAST 1 cglobal x264_coeff_last15_%1, 1,3 + pxor m2, m2 LAST_MASK r1d, r0-2, r2d xor r1d, 0xffff - bsr eax, r1d + LAST eax, r1d, 0x1f dec eax RET cglobal x264_coeff_last16_%1, 1,3 + pxor m2, m2 LAST_MASK r1d, r0, r2d xor r1d, 0xffff - bsr eax, r1d + LAST eax, r1d, 0x1f RET %ifndef ARCH_X86_64 -%ifidn %1, mmxext - cglobal x264_coeff_last64_%1, 1,5 -%else - cglobal x264_coeff_last64_%1, 1,4 -%endif +cglobal x264_coeff_last64_%1, 1, 5-mmsize/16 + pxor m2, m2 LAST_MASK r1d, r0, r4d LAST_MASK r2d, r0+32, r4d shl r2d, 16 @@ -744,17 +751,15 @@ not r1d xor r2d, -1 jne .secondhalf - bsr eax, r1d + LAST eax, r1d, 0x1f RET .secondhalf: - bsr eax, r2d + LAST eax, r2d, 0x1f add eax, 32 RET -%endif -%endmacro - -%ifdef ARCH_X86_64 - cglobal x264_coeff_last64_sse2, 1,4 +%else +cglobal x264_coeff_last64_%1, 1,4 + pxor m2, m2 LAST_MASK_SSE2 r1d, r0 LAST_MASK_SSE2 r2d, r0+32 LAST_MASK_SSE2 r3d, r0+64 @@ -766,13 +771,94 @@ shl r3, 32 or r1, r3 not r1 - bsr rax, r1 + LAST rax, r1, 0x3f RET %endif +%endmacro +%define LAST LAST_X86 %ifndef ARCH_X86_64 +INIT_MMX %define LAST_MASK LAST_MASK_MMX COEFF_LAST mmxext %endif +INIT_XMM %define LAST_MASK LAST_MASK_SSE2 COEFF_LAST sse2 +%define LAST LAST_SSE4A +COEFF_LAST sse2_lzcnt + +;----------------------------------------------------------------------------- +; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel ) +;----------------------------------------------------------------------------- + +%macro LAST_MASK4_MMX 2-3 + movq mm0, [%2] + packsswb mm0, mm0 + pcmpeqb mm0, mm2 + pmovmskb %1, mm0 +%endmacro + +%macro LZCOUNT_X86 3 + bsr %1, %2 + xor %1, %3 +%endmacro + +%macro LZCOUNT_SSE4A 3 + lzcnt %1, %2 +%endmacro + +; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args +%ifdef ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6 +%else + DECLARE_REG_TMP 6,3,2,1,4,5,0 +%endif + +%macro COEFF_LEVELRUN 2 +cglobal x264_coeff_level_run%2_%1,0,7 + movifnidn t0d, r0m + movifnidn t1d, r1m + pxor m2, m2 + LAST_MASK 
t5d, t0-(%2&1)*2, t4d + not t5d + shl t5d, 32-((%2+1)&~1) + mov t4d, %2-1 + LZCOUNT t3d, t5d, 0x1f + xor t6d, t6d + add t5d, t5d + sub t4d, t3d + shl t5d, t3b + mov [t1], t4d +.loop: + LZCOUNT t3d, t5d, 0x1f + mov t2w, [t0+t4*2] + mov [t1+t6 +36], t3b + mov [t1+t6*2+ 4], t2w + inc t3d + shl t5d, t3b + inc t6d + sub t4d, t3d + jge .loop + RET +%endmacro + +INIT_MMX +%define LZCOUNT LZCOUNT_X86 +%ifndef ARCH_X86_64 +%define LAST_MASK LAST_MASK_MMX +COEFF_LEVELRUN mmxext, 15 +COEFF_LEVELRUN mmxext, 16 +%endif +%define LAST_MASK LAST_MASK4_MMX +COEFF_LEVELRUN mmxext, 4 +INIT_XMM +%define LAST_MASK LAST_MASK_SSE2 +COEFF_LEVELRUN sse2, 15 +COEFF_LEVELRUN sse2, 16 +%define LZCOUNT LZCOUNT_SSE4A +COEFF_LEVELRUN sse2_lzcnt, 15 +COEFF_LEVELRUN sse2_lzcnt, 16 +INIT_MMX +%define LAST_MASK LAST_MASK4_MMX +COEFF_LEVELRUN mmxext_lzcnt, 4 | ||
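Note: the _lzcnt variants above swap bsr for lzcnt (AMD's SSE4a/ABM extension, matching the new X264_CPU_LZCNT flag), avoiding bsr's undefined zero-input result. For a nonzero 32-bit value both lzcnt(x) and the top-bit index lie in [0,31], where 31 - n equals 31 ^ n because 31 is all ones in those five bits, so LAST_SSE4A recovers the bsr result with a single xor (0x1f, or 0x3f for 64-bit operands). A C sanity check of that identity, using the GCC builtin as a stand-in for the instruction:

#include <assert.h>
#include <stdint.h>

/* for nonzero 32-bit x, both clz(x) and the top-bit index fit in 5 bits,
 * and 31 - n == 31 ^ n there (31 is all ones, so no borrows occur);
 * hence bsr(x) == lzcnt(x) ^ 31: one xor instead of a subtract */
static int bsr_via_clz( uint32_t x )
{
    return __builtin_clz( x ) ^ 31;
}

int main(void)
{
    for( uint64_t x = 1; x <= 0xffffffffu; x += 0x10001 )  /* sparse sweep */
        assert( bsr_via_clz( (uint32_t)x ) == 31 - __builtin_clz( (uint32_t)x ) );
    assert( bsr_via_clz( 1 ) == 0 && bsr_via_clz( 0x80000000u ) == 31 );
    return 0;
}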
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/quant.h
@@ -64,5 +64,17 @@
 int x264_coeff_last15_sse2( int16_t *dct );
 int x264_coeff_last16_sse2( int16_t *dct );
 int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
+int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
+int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 
 #endif
Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/x86inc.asm
@@ -116,6 +116,29 @@
 DECLARE_REG_SIZE di, dil
 DECLARE_REG_SIZE bp, bpl
 
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+
 %ifdef ARCH_X86_64
 %define gprsize 8
 %else
Changed: x264-snapshot-20090119-2245.tar.bz2/configure
@@ -263,7 +263,7 @@
         then
             ALTIVECFLAGS="$ALTIVECFLAGS -faltivec -fastf -mcpu=G4"
         else
-            ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec"
+            ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec -DHAVE_ALTIVEC_H"
         fi
         ;;
     sparc)
Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/analyse.c
@@ -22,6 +22,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ +#define _ISOC99_SOURCE #include <math.h> #include <limits.h> #ifndef _MSC_VER @@ -29,6 +30,7 @@ #endif #include "common/common.h" +#include "common/cpu.h" #include "macroblock.h" #include "me.h" #include "ratecontrol.h" @@ -77,6 +79,8 @@ int i_lambda2; int i_qp; int16_t *p_cost_mv; + uint16_t *p_cost_ref0; + uint16_t *p_cost_ref1; int i_mbrd; @@ -168,6 +172,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); uint16_t *x264_cost_mv_fpel[52][4]; +uint16_t x264_cost_ref[52][3][33]; /* initialize an array of lambda*nbits for all possible mvs */ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) @@ -177,6 +182,7 @@ if( !p_cost_mv[a->i_qp] ) { + x264_emms(); /* could be faster, but isn't called many times */ /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) ); @@ -184,10 +190,15 @@ for( i = 0; i <= 2*4*2048; i++ ) { p_cost_mv[a->i_qp][-i] = - p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i ); + p_cost_mv[a->i_qp][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; } + for( i = 0; i < 3; i++ ) + for( j = 0; j < 33; j++ ) + x264_cost_ref[a->i_qp][i][j] = a->i_lambda * bs_size_te( i, j ); } a->p_cost_mv = p_cost_mv[a->i_qp]; + a->p_cost_ref0 = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; + a->p_cost_ref1 = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; /* FIXME is this useful for all me methods? */ if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] ) @@ -727,8 +738,9 @@ } else { + static const uint16_t cost_div_fix8[3] = {1024,512,341}; a->i_satd_i8x8 = COST_MAX; - i_cost = i_cost * 4/(idx+1); + i_cost = (i_cost * cost_div_fix8[idx]) >> 8; } if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 ) return; @@ -1037,7 +1049,7 @@ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; #define REF_COST(list, ref) \ - (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref )) + (a->p_cost_ref##list[ref]) static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { @@ -2464,11 +2476,7 @@ { if( !h->mb.b_direct_auto_write ) x264_mb_mc( h ); - if( h->mb.b_lossless ) - { - /* chance of skip is too small to bother */ - } - else if( analysis.i_mbrd ) + if( analysis.i_mbrd ) { i_bskip_cost = ssd_mb( h ); /* 6 = minimum cavlc cost of a non-skipped MB */ | ||
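Note: the mv-cost initialization above replaces the exact bs_size_se() bit count with the smooth curve lambda * (2*log2f(i+1) + 0.718f + !!i), presumably why an x264_emms() call now precedes the float math. For signed Exp-Golomb the exact length is 2*floor(log2(codenum+1)) + 1 bits with codenum = 2|i| - (i>0), a staircase that jumps by 2 bits at every power of two; the float form tracks that staircase while staying monotonic in |i|. A quick comparison sketch (se_size here reimplements the standard code length, not x264's helper):

#include <math.h>
#include <stdio.h>

/* exact bit cost of a signed Exp-Golomb mv delta: i maps to codenum
 * 2|i| - (i>0), coded in 2*floor(log2(codenum+1)) + 1 bits */
static int se_size( int i )
{
    unsigned x = ( i <= 0 ? -2*i : 2*i - 1 ) + 1;  /* codenum + 1 */
    int lg = 0;
    while( x >> (lg+1) )
        lg++;
    return 2*lg + 1;
}

int main(void)
{
    /* the smooth curve oscillates around the exact staircase instead of
     * jumping by 2 bits at each power of two */
    for( int i = 0; i <= 16; i++ )
        printf( "|mvd|=%2d  exact=%2d  smooth=%5.2f\n",
                i, se_size( i ), 2*log2f( i+1 ) + 0.718f + !!i );
    return 0;
}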
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/cabac.c
@@ -50,9 +50,7 @@ x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma ); if( h->mb.i_cbp_chroma == 0 ) - { x264_cabac_encode_decision_noup( cb, ctx2, 0 ); - } else { x264_cabac_encode_decision( cb, ctx2, 1 ); @@ -77,13 +75,9 @@ { int ctx = 0; if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 ) - { ctx++; - } if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 ) - { ctx++; - } x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 ); } @@ -130,18 +124,12 @@ { int ctx = 0; if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT ) - { ctx++; - } if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) - { ctx++; - } if( i_mb_type == B_DIRECT ) - { x264_cabac_encode_decision_noup( cb, 27+ctx, 0 ); - } else if( i_mb_type == B_8x8 ) { x264_cabac_encode_decision_noup( cb, 27+ctx, 1 ); @@ -210,18 +198,12 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode ) { if( i_pred == i_mode ) - { - /* b_prev_intra4x4_pred_mode */ x264_cabac_encode_decision( cb, 68, 1 ); - } else { - /* b_prev_intra4x4_pred_mode */ x264_cabac_encode_decision( cb, 68, 0 ); if( i_mode > i_pred ) - { i_mode--; - } x264_cabac_encode_decision( cb, 69, (i_mode )&0x01 ); x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 ); x264_cabac_encode_decision( cb, 69, (i_mode >> 2)&0x01 ); @@ -235,22 +217,16 @@ /* No need to test for I4x4 or I_16x16 as cache_save handle that */ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_xy - 1] != 0 ) - { ctx++; - } if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 ) - { ctx++; - } x264_cabac_encode_decision_noup( cb, 64 + ctx, i_mode > 0 ); if( i_mode > 0 ) { x264_cabac_encode_decision( cb, 64 + 3, i_mode > 1 ); if( i_mode > 1 ) - { x264_cabac_encode_decision_noup( cb, 64 + 3, i_mode > 2 ); - } } } @@ -273,22 +249,16 @@ /* No need to test for SKIP/PCM */ if( h->mb.i_neighbour & MB_LEFT ) - { cbp_a = (h->mb.cbp[h->mb.i_mb_xy - 1] >> 4)&0x3; - } if( h->mb.i_neighbour & MB_TOP ) - { cbp_b = (h->mb.cbp[h->mb.i_mb_top_xy] >> 4)&0x3; - } ctx = 0; if( cbp_a > 0 ) ctx++; if( cbp_b > 0 ) ctx += 2; if( h->mb.i_cbp_chroma == 0 ) - { x264_cabac_encode_decision_noup( cb, 77 + ctx, 0 ); - } else { x264_cabac_encode_decision_noup( cb, 77 + ctx, 1 ); @@ -316,11 +286,8 @@ } /* No need to test for PCM / SKIP */ - if( h->mb.i_last_dqp && - ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) ) ) - ctx = 1; - else - ctx = 0; + ctx = h->mb.i_last_dqp && + ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) ); if( i_dqp != 0 ) { @@ -331,10 +298,7 @@ while( val-- ) { x264_cabac_encode_decision( cb, 60 + ctx, 1 ); - if( ctx < 2 ) - ctx = 2; - else - ctx = 3; + ctx = 2+(ctx>>1); } } x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 ); @@ -353,9 +317,7 @@ static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub ) { if( i_sub == D_L0_8x8 ) - { x264_cabac_encode_decision( cb, 21, 1 ); - } else if( i_sub == D_L0_8x4 ) { x264_cabac_encode_decision( cb, 21, 0 ); @@ -434,11 +396,7 @@ while( i_ref > 0 ) { x264_cabac_encode_decision( cb, 54 + ctx, 1 ); - if( ctx < 4 ) - ctx = 4; - else - ctx = 5; - + ctx = (ctx>>2)+4; i_ref--; } x264_cabac_encode_decision( cb, 54 + ctx, 0 ); @@ -678,6 +636,7 @@ { 4, 4, 4, 4, 5, 6, 7, 7 } }; +#if !RDO_SKIP_BS static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count 
) { const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; @@ -734,9 +693,7 @@ if( i == i_last ) { i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1; -#if !RDO_SKIP_BS i_coeff_sign[i_coeff] = l[i] < 0; -#endif i_coeff++; } @@ -753,15 +710,10 @@ { x264_cabac_encode_decision( cb, ctx, 1 ); ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level; -#if RDO_SKIP_BS - cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]]; -#else for( i = 0; i < i_prefix - 1; i++ ) x264_cabac_encode_decision( cb, ctx, 1 ); if( i_prefix < 14 ) x264_cabac_encode_decision( cb, ctx, 0 ); -#endif if( i_prefix >= 14 ) x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 ); @@ -771,18 +723,110 @@ { x264_cabac_encode_decision( cb, ctx, 0 ); node_ctx = coeff_abs_level_transition[0][node_ctx]; -#if RDO_SKIP_BS - x264_cabac_encode_bypass( cb, 0 ); // sign -#endif } -#if !RDO_SKIP_BS x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] ); -#endif } while( i_coeff > 0 ); } +#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 ) + +#else + +/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct + * this is slightly incorrect because the sigmap is not reversible + * (contexts are repeated). However, there is nearly no quality penalty + * for this (~0.001db) and the speed boost (~30%) is worth it. */ +static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 ) +{ + const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; + const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; + const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat]; + const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced]; + int i_last, i_coeff_abs_m1, ctx, i_prefix, i, node_ctx; + if( !b_8x8 ) + { + /* coded block flag */ + ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ); + if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] ) + x264_cabac_encode_decision( cb, ctx, 1 ); + else + { + x264_cabac_encode_decision( cb, ctx, 0 ); + return; + } + } + i_last = h->quantf.coeff_last[i_ctxBlockCat](l); + + i_coeff_abs_m1 = abs(l[i_last]) - 1; + i_prefix = X264_MIN( i_coeff_abs_m1, 14 ); + ctx = coeff_abs_level1_ctx[0] + i_ctx_level; + + if( i_last != i_count - 1 ) + { + x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 ); + x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 ); + } + + if( i_prefix ) + { + x264_cabac_encode_decision( cb, ctx, 1 ); + ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level; + cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; + cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]]; + if( i_prefix >= 14 ) + x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 ); + node_ctx = coeff_abs_level_transition[1][0]; + } + else + { + x264_cabac_encode_decision( cb, ctx, 0 ); + node_ctx = coeff_abs_level_transition[0][0]; + x264_cabac_encode_bypass( cb, 0 ); // sign + } + + for( i = i_last-1 ; i >= 0; i-- ) + { + if( l[i] ) + { + x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 ); + x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 ); + ctx = 
coeff_abs_level1_ctx[node_ctx] + i_ctx_level; + + if( (unsigned)(l[i]+1) > 2 ) + { + i_coeff_abs_m1 = abs(l[i]) - 1; + i_prefix = X264_MIN( i_coeff_abs_m1, 14 ); + x264_cabac_encode_decision( cb, ctx, 1 ); + ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level; + cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; + cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]]; + if( i_prefix >= 14 ) + x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 ); + node_ctx = coeff_abs_level_transition[1][node_ctx]; + } + else + { + x264_cabac_encode_decision( cb, ctx, 0 ); + node_ctx = coeff_abs_level_transition[0][node_ctx]; + x264_cabac_encode_bypass( cb, 0 ); + } + } + else + x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 ); + } +} + +static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l ) +{ + block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 ); +} +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count ) +{ + block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 ); +} +#endif void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) { @@ -923,18 +967,11 @@ else if( i_mb_type != B_DIRECT ) { /* All B mode */ - int b_list[2][2]; - - /* init ref list utilisations */ - for( i = 0; i < 2; i++ ) - { - b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i]; - b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i]; - } + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; for( i_list = 0; i_list < 2; i_list++ ) { - const int i_ref_max = i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]; + const int i_ref_max = h->mb.pic.i_fref[i_list]; if( i_ref_max > 1 ) { @@ -1008,7 +1045,7 @@ { for( i = 0; i < 4; i++ ) if( h->mb.i_cbp_luma & ( 1 << i ) ) - block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 ); + block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] ); } else { @@ -1054,8 +1091,8 @@ x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 ) { - if( x264_mb_type_list0_table[ i_mb_type ][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); - if( x264_mb_type_list1_table[ i_mb_type ][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 ); } else if( i_mb_type == B_8x8 ) { @@ -1073,7 +1110,7 @@ if( h->mb.i_cbp_luma & (1 << i8) ) { if( h->mb.b_transform_8x8 ) - block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 ); + block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] ); else { int i4; @@ -1112,7 +1149,7 @@ { *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101; *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101; - block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 ); + block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] ); } else { | ||
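Besides the merged sigmap/level RDO path, two small rewrites above replace saturating if/else context updates with arithmetic: ctx = 2+(ctx>>1) in the dqp loop (previously "if ctx < 2 then 2 else 3") and ctx = (ctx>>2)+4 in the ref loop (previously "if ctx < 4 then 4 else 5"). A throwaway check that the expressions agree with the old branches over the context values those loops can actually see:

    #include <assert.h>
    #include <stdio.h>

    int main( void )
    {
        int ctx;
        /* dqp: ctx enters as 0 or 1 and then sticks at 2 or 3 */
        for( ctx = 0; ctx <= 3; ctx++ )
            assert( 2 + (ctx >> 1) == (ctx < 2 ? 2 : 3) );
        /* ref: ctx enters as 0..3 and then sticks at 4 or 5 */
        for( ctx = 0; ctx <= 5; ctx++ )
            assert( (ctx >> 2) + 4 == (ctx < 4 ? 4 : 5) );
        printf( "branchless context updates match\n" );
        return 0;
    }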
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/cavlc.c
@@ -96,7 +96,7 @@ /* Weight highly against overflows. */ s->i_bits_encoded += 1000000; #else - x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code ); + x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code ); /* clip level, preserving sign */ i_level_code = (1<<12) - 2 + (i_level_code & 1); #endif @@ -116,8 +116,8 @@ { static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3}; static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0}; - int level[16], run[16]; - int i_trailing, i_total_zero, i_last, i_suffix_length, i; + x264_run_level_t runlevel; + int i_trailing, i_total_zero, i_suffix_length, i; int i_total = 0; unsigned int i_sign; /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */ @@ -125,40 +125,30 @@ if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] ) { - bs_write_vlc( s, x264_coeff_token[nC][0] ); + bs_write_vlc( s, x264_coeff0_token[nC] ); return; } - i_last = h->quantf.coeff_last[i_ctxBlockCat](l); - i_total_zero = i_last + 1; - /* level and run and total */ /* set these to 2 to allow branchless i_trailing calculation */ - level[1] = 2; - level[2] = 2; - do - { - int r = 0; - level[i_total] = l[i_last]; - while( --i_last >= 0 && l[i_last] == 0 ) - r++; - run[i_total++] = r; - } while( i_last >= 0 ); + runlevel.level[1] = 2; + runlevel.level[2] = 2; + i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel ); + i_total_zero = runlevel.last + 1 - i_total; h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total; - i_total_zero -= i_total; - i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1 - | ((((level[1]+1) | (1-level[1])) >> 31) & 2) - | ((((level[2]+1) | (1-level[2])) >> 31) & 4); + i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 + | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) + | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); i_trailing = ctz_index[i_trailing]; - i_sign = ((level[2] >> 31) & 1) - | ((level[1] >> 31) & 2) - | ((level[0] >> 31) & 4); + i_sign = ((runlevel.level[2] >> 31) & 1) + | ((runlevel.level[1] >> 31) & 2) + | ((runlevel.level[0] >> 31) & 4); i_sign >>= 3-i_trailing; /* total/trailing */ - bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] ); + bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing-4] ); i_suffix_length = i_total > 10 && i_trailing < 3; if( i_trailing > 0 || RDO_SKIP_BS ) @@ -166,10 +156,10 @@ if( i_trailing < i_total ) { - int16_t val = level[i_trailing]; - int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2; + int16_t val = runlevel.level[i_trailing]; + int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2; if( i_trailing < 3 ) - val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */ + val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */ val += LEVEL_TABLE_SIZE/2; if( (unsigned)val_original < LEVEL_TABLE_SIZE ) @@ -181,7 +171,7 @@ i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 ); for( i = i_trailing+1; i < i_total; i++ ) { - val = level[i] + LEVEL_TABLE_SIZE/2; + val = runlevel.level[i] + LEVEL_TABLE_SIZE/2; if( (unsigned)val < LEVEL_TABLE_SIZE ) { bs_write_vlc( s, x264_level_token[i_suffix_length][val] ); @@ -203,8 +193,8 @@ for( i = 0; i < i_total-1 && i_total_zero > 0; i++ ) { int i_zl = X264_MIN( i_total_zero - 1, 6 ); - bs_write_vlc( s, 
x264_run_before[i_zl][run[i]] ); - i_total_zero -= run[i]; + bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] ); + i_total_zero -= runlevel.run[i]; } } @@ -441,17 +431,17 @@ } else if( i_mb_type == P_8x8 ) { - int b_sub_ref0; + int b_sub_ref; if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] | h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 ) { bs_write_ue( s, 4 ); - b_sub_ref0 = 0; + b_sub_ref = 0; } else { bs_write_ue( s, 3 ); - b_sub_ref0 = 1; + b_sub_ref = h->mb.pic.i_fref[0] > 1; } /* sub mb type */ @@ -462,7 +452,7 @@ bs_write( s, 4, 0xf ); /* ref0 */ - if( h->mb.pic.i_fref[0] > 1 && b_sub_ref0 ) + if( b_sub_ref ) { bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] ); @@ -501,15 +491,7 @@ /* Motion Vector */ int i_list; DECLARE_ALIGNED_4( int16_t mvp[2] ); - - int b_list[2][2]; - - /* init ref list utilisations */ - for( i = 0; i < 2; i++ ) - { - b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i]; - b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i]; - } + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] ); @@ -655,8 +637,8 @@ cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 ); else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 ) { - if( x264_mb_type_list0_table[ i_mb_type ][!!i8] ) cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 ); - if( x264_mb_type_list1_table[ i_mb_type ][!!i8] ) cavlc_mb_mvd( h, &s, 1, 4*i8, 4>>b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &s, 1, 4*i8, 4>>b_8x16 ); } else if( i_mb_type == B_8x8 ) { | ||
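The CAVLC writer no longer builds its own level/run arrays; it calls the new h->quantf.coeff_level_run[] hook, which fills an x264_run_level_t with the same data the deleted loop produced. A plain-C reference of what that scan computes (struct and function names below are illustrative; only the field names last/level/run are taken from this patch):

    #include <stdint.h>

    typedef struct
    {
        int     last;        /* index of the last nonzero coefficient */
        int16_t level[16];   /* nonzero values, last (highest-frequency) first */
        uint8_t run[16];     /* zeros between one nonzero coeff and the next one down */
    } run_level_sketch_t;

    /* Mirror of the loop this hunk removes from block_residual_write_cavlc:
     * scan downward from the last nonzero coefficient, recording each level
     * and the run of zeros that follows it.  Returns the number of levels. */
    static int coeff_level_run_ref( const int16_t *l, int i_count, run_level_sketch_t *rl )
    {
        int i_last = i_count - 1;
        int total = 0;
        while( i_last >= 0 && !l[i_last] )
            i_last--;
        rl->last = i_last;
        while( i_last >= 0 )
        {
            int run = 0;
            rl->level[total] = l[i_last];
            while( --i_last >= 0 && !l[i_last] )
                run++;
            rl->run[total++] = run;
        }
        return total;
    }

The branchless trailing-ones test above it is unchanged apart from the variable rename: for the int16 levels involved, ((x+1)|(1-x))>>31 & 1 is 1 exactly when |x| > 1.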
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/encoder.c
@@ -411,6 +411,7 @@ h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; h->param.analyse.f_psy_rd = 0; + h->param.i_bframe = 0; /* 8x8dct is not useful at all in CAVLC lossless */ if( !h->param.b_cabac ) h->param.analyse.b_transform_8x8 = 0; @@ -713,6 +714,7 @@ || h->param.i_bframe_adaptive || h->param.b_pre_scenecut ); h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0); + h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8); h->frames.i_last_idr = - h->param.i_keyint_max; h->frames.i_input = 0; @@ -824,7 +826,9 @@ COPY( analyse.intra ); COPY( analyse.inter ); COPY( analyse.i_direct_mv_pred ); - COPY( analyse.i_me_range ); + /* Scratch buffer prevents me_range from being increased for esa/tesa */ + if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range ) + COPY( analyse.i_me_range ); COPY( analyse.i_noise_reduction ); /* We can't switch out of subme=0 during encoding. */ if( h->param.analyse.i_subpel_refine ) @@ -839,6 +843,8 @@ // can only twiddle these if they were enabled to begin with: if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA ) COPY( analyse.i_me_method ); + if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa ) + h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8; if( h->pps->b_transform_8x8_mode ) COPY( analyse.b_transform_8x8 ); if( h->frames.i_max_ref1 > 1 ) @@ -1046,7 +1052,7 @@ x264_pixel_ssim_wxh( &h->pixf, h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0], h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0], - h->param.i_width-2, max_y-min_y ); + h->param.i_width-2, max_y-min_y, h->scratch_buffer ); } } @@ -1433,7 +1439,7 @@ return 0; } - x264_slicetype_decide( h ); + x264_stack_align( x264_slicetype_decide, h ); /* 3: move some B-frames and 1 non-B to encode queue */ while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) ) @@ -1976,8 +1982,8 @@ for( i = 0; i < X264_PARTTYPE_MAX; i++ ) for( j = 0; j < 2; j++ ) { - int l0 = x264_mb_type_list0_table[i][j]; - int l1 = x264_mb_type_list1_table[i][j]; + int l0 = x264_mb_type_list_table[i][0][j]; + int l1 = x264_mb_type_list_table[i][1][j]; if( l0 || l1 ) list_count[l1+l0*l1] += h->stat.i_mb_count[SLICE_TYPE_B][i] * 2; } | ||
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/me.c
@@ -474,8 +474,7 @@ DECLARE_ALIGNED_16( int enc_dc[4] ); int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4; int delta = x264_pixel_size[sad_size].w; - int16_t xs_buf[64]; - int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) ); + int16_t *xs = h->scratch_buffer; int xn; uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2); @@ -492,11 +491,7 @@ if( h->mb.i_me_method == X264_ME_TESA ) { // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD - typedef struct { - int sad; - int16_t mx, my; - } mvsad_t; - mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) ); + mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15)); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride ) @@ -581,7 +576,6 @@ } for( i=0; i<nmvsad; i++ ) COST_MV( mvsads[i].mx, mvsads[i].my ); - x264_free( mvsads ); } else { @@ -601,9 +595,6 @@ COST_MV( min_x+xs[i], my ); } } - - if( xs != xs_buf ) - x264_free( xs ); #endif } break; | ||
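The per-call mallocs for the ESA/TESA temporaries are gone: the xs coordinate list and the TESA mvsad candidate array now share one pre-allocated h->scratch_buffer, with the mvsad_t area starting after the xs area rounded up to a multiple of 16 int16 slots. A small sketch of that carve-up; the function and parameter names are illustrative, and the buffer is assumed to be sized elsewhere to hold both regions.

    #include <stdint.h>

    typedef struct { int sad; int16_t mx, my; } mvsad_t;   /* as added to me.h in this patch */

    static void carve_scratch( void *scratch, int width, int16_t **xs, mvsad_t **mvsads )
    {
        *xs     = scratch;                                    /* int16_t xs[width] first */
        *mvsads = (mvsad_t *)( *xs + ((width + 15) & ~15) );  /* then the SAD candidates */
        /* rounding width up to a multiple of 16 keeps the mvsad_t area aligned,
         * provided the scratch buffer itself is aligned */
    }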
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/me.h
@@ -48,6 +48,11 @@ DECLARE_ALIGNED_4( int16_t mv[2] ); } DECLARE_ALIGNED_16( x264_me_t ); +typedef struct { + int sad; + int16_t mx, my; +} mvsad_t; + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh ); static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc ) { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); } | ||
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/ratecontrol.c
@@ -174,8 +174,8 @@ * and putting it after floating point ops. As a result, we put the emms at the end of the * function and make sure that its always called before the float math. Noinline makes * sure no reordering goes on. */ - unsigned int var=0, sad, i; - for( i=0; i<3; i++ ) + unsigned int var = 0, i; + for( i = 0; i < 3; i++ ) { int w = i ? 8 : 16; int stride = frame->i_stride[i]; @@ -184,7 +184,7 @@ : w * (mb_x + mb_y * stride); int pix = i ? PIXEL_8x8 : PIXEL_16x16; stride <<= h->mb.b_interlaced; - var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad ); + var += h->pixf.var[pix]( frame->plane[i]+offset, stride ); } var = X264_MAX(var,1); x264_emms(); @@ -441,6 +441,12 @@ if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR ) x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" ); + if( !strstr( opts, "direct=3" ) && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ) + { + x264_log( h, X264_LOG_WARNING, "direct=auto not used on the first pass\n" ); + h->mb.b_direct_auto_write = 1; + } + if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS ) h->param.i_bframe_adaptive = i; else if( h->param.i_bframe ) @@ -612,7 +618,7 @@ p += len; if( !*p ) return 0; - z->param = malloc( sizeof(x264_param_t) ); + z->param = x264_malloc( sizeof(x264_param_t) ); memcpy( z->param, &h->param, sizeof(x264_param_t) ); while( (tok = strtok_r( p, ",", &saveptr )) ) { @@ -1507,7 +1513,7 @@ expected_size = qscale2bits(&rce, q); expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size; } - rcc->last_satd = x264_rc_analyse_slice( h ); + rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h ); } q = x264_clip3f( q, lmin, lmax ); } @@ -1525,7 +1531,7 @@ double wanted_bits, overflow=1, lmin, lmax; - rcc->last_satd = x264_rc_analyse_slice( h ); + rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h ); rcc->short_term_cplxsum *= 0.5; rcc->short_term_cplxcount *= 0.5; rcc->short_term_cplxsum += rcc->last_satd; | ||
Changed | x264-snapshot-20090119-2245.tar.bz2/encoder/slicetype.c
@@ -489,7 +489,7 @@ if( !h->frames.last_nonb ) return; frames[0] = h->frames.last_nonb; - for( j = 0; h->frames.next[j]; j++ ) + for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ ) frames[j+1] = h->frames.next[j]; keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1; num_frames = X264_MIN( j, keyint_limit ); @@ -630,10 +630,8 @@ frm->i_type = X264_TYPE_P; } - if( frm->i_type != X264_TYPE_AUTO && frm->i_type != X264_TYPE_B && frm->i_type != X264_TYPE_BREF ) - break; - - frm->i_type = X264_TYPE_B; + if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B; + else if( !IS_X264_TYPE_B( frm->i_type ) ) break; } } | ||
Changed | x264-snapshot-20090119-2245.tar.bz2/muxers.c
@@ -290,7 +290,7 @@ header[slen] = 0; if (strncmp(header, Y4M_FRAME_MAGIC, slen)) { - fprintf(stderr, "Bad header magic (%08X <=> %s)\n", + fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n", *((uint32_t*)header), header); return -1; } | ||
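The format-string fix matters because %08X assumes its argument is an unsigned int, which uint32_t need not be on every platform; the <inttypes.h> macro always expands to the matching conversion specifier. A minimal example of the pattern (the value is made up):

    #include <inttypes.h>
    #include <stdio.h>

    int main( void )
    {
        uint32_t magic = 0x12345678;   /* made-up value standing in for the y4m header word */
        printf( "Bad header magic (%"PRIx32")\n", magic );
        return 0;
    }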
Changed | x264-snapshot-20090119-2245.tar.bz2/tools/checkasm.c
@@ -156,7 +156,8 @@ b->cpu&X264_CPU_MMX ? "mmx" : "c", b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "", + b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : + b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -330,16 +331,15 @@ #define TEST_PIXEL_VAR( i ) \ if( pixel_asm.var[i] != pixel_ref.var[i] ) \ { \ - uint32_t res_c, res_asm; \ - uint32_t sad_c, sad_asm; \ + int res_c, res_asm; \ set_func_name( "%s_%s", "var", pixel_names[i] ); \ used_asm = 1; \ - res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \ - res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \ - if( (res_c != res_asm) || (sad_c != sad_asm) ) \ + res_c = call_c( pixel_c.var[i], buf1, 16 ); \ + res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \ + if( res_c != res_asm ) \ { \ ok = 0; \ - fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \ + fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ } \ } @@ -408,8 +408,8 @@ int sums[5][4] = {{0}}; used_asm = ok = 1; x264_emms(); - res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 ); - res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 ); + res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 ); + res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 ); if( fabs(res_c - res_a) > 1e-6 ) { ok = 0; @@ -793,12 +793,13 @@ uint8_t *src = buf1+8+2*64; uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 }; uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 }; + void *tmp = buf3+49*64; set_func_name( "hpel_filter" ); ok = 1; used_asm = 1; memset( buf3, 0, 4096 ); memset( buf4, 0, 4096 ); - call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 ); - call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 ); + call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10, tmp ); + call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10, tmp ); for( i=0; i<3; i++ ) for( j=0; j<10; j++ ) //FIXME ideally the first pixels would match too, but they aren't actually used @@ -822,33 +823,57 @@ uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 }; uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 }; set_func_name( "lowres_init" ); + ok = 1; used_asm = 1; for( w=40; w<=48; w+=8 ) - if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) + { + int stride = (w+8)&~15; + call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 ); + call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 ); + for( i=0; i<16; i++) { - int stride = (w+8)&~15; - used_asm = 1; - call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 ); - call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 ); - for( i=0; i<16; i++) - { - for( j=0; j<4; j++) - if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) ) - { - ok = 0; - fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i ); - for( k=0; k<w; k++ ) - printf( "%d ", dstc[j][k+i*stride] ); - printf("\n"); - for( k=0; k<w; k++ ) - printf( "%d ", dsta[j][k+i*stride] ); - printf("\n"); - break; - } - } + for( j=0; j<4; j++) + if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) ) + { + ok = 0; + fprintf( stderr, "frame_init_lowres differs at plane %d line 
%d\n", j, i ); + for( k=0; k<w; k++ ) + printf( "%d ", dstc[j][k+i*stride] ); + printf("\n"); + for( k=0; k<w; k++ ) + printf( "%d ", dsta[j][k+i*stride] ); + printf("\n"); + break; + } } + } report( "lowres init :" ); } +#define INTEGRAL_INIT( name, size, ... )\ + if( mc_a.name != mc_ref.name )\ + {\ + int stride = 80;\ + set_func_name( #name );\ + used_asm = 1;\ + memcpy( buf3, buf1, size*2*stride );\ + memcpy( buf4, buf1, size*2*stride );\ + uint16_t *sum = (uint16_t*)buf3;\ + call_c1( mc_c.name, __VA_ARGS__ );\ + sum = (uint16_t*)buf4;\ + call_a1( mc_a.name, __VA_ARGS__ );\ + if( memcmp( buf3, buf4, (stride-8)*2 )\ + || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\ + ok = 0;\ + call_c2( mc_c.name, __VA_ARGS__ );\ + call_a2( mc_a.name, __VA_ARGS__ );\ + } + ok = 1; used_asm = 0; + INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride ); + INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride ); + INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride ); + INTEGRAL_INIT( integral_init8v, 9, sum, stride ); + report( "integral init :" ); + return ret; } @@ -1104,7 +1129,7 @@ ok = oks[1]; used_asm = used_asms[1]; report( "dequant :" ); - ok = 1; + ok = 1; used_asm = 0; if( qf_a.denoise_dct != qf_ref.denoise_dct ) { int size; @@ -1137,21 +1162,18 @@ dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \ if( ac ) \ dct1[0] = 0; \ - memcpy( dct2, dct1, w*w*2 ); \ - result_c = call_c1( qf_c.decname, (void*)dct2 ); \ - result_a = call_a1( qf_a.decname, (void*)dct2 ); \ + result_c = call_c( qf_c.decname, (void*)dct1 ); \ + result_a = call_a( qf_a.decname, (void*)dct1 ); \ if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \ { \ ok = 0; \ fprintf( stderr, #decname ": [FAILED]\n" ); \ break; \ } \ - call_c2( qf_c.decname, (void*)dct2 ); \ - call_a2( qf_a.decname, (void*)dct2 ); \ } \ } - ok = 1; + ok = 1; used_asm = 0; TEST_DECIMATE( decimate_score64, 8, 0, 6 ); TEST_DECIMATE( decimate_score16, 4, 0, 6 ); TEST_DECIMATE( decimate_score15, 4, 1, 7 ); @@ -1171,27 +1193,60 @@ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ dct1[ac] = 1; \ - memcpy( dct2, dct1, w*w*2 ); \ - result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \ - result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \ + result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \ + result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \ if( result_c != result_a ) \ { \ ok = 0; \ fprintf( stderr, #lastname ": [FAILED]\n" ); \ break; \ } \ - call_c2( qf_c.last, (void*)(dct2+ac) ); \ - call_a2( qf_a.last, (void*)(dct2+ac) ); \ } \ } - ok = 1; + ok = 1; used_asm = 0; TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 ); TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 ); TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 ); TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 ); report( "coeff_last :" ); +#define TEST_LEVELRUN( lastname, name, w, ac ) \ + if( qf_a.lastname != qf_ref.lastname ) \ + { \ + set_func_name( #name ); \ + used_asm = 1; \ + for( i = 0; i < 100; i++ ) \ + { \ + x264_run_level_t runlevel_c, runlevel_a; \ + int result_c, result_a, idx, nnz=0; \ + int max = rand() & (w*w-1); \ + memset( dct1, 0, w*w*2 ); \ + memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \ + memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \ + for( idx = ac; idx < max; idx++ ) \ + nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ + if( !nnz ) \ + dct1[ac] = 1; \ + result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \ + result_a = 
call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \ + if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ + memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \ + memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \ + { \ + ok = 0; \ + fprintf( stderr, #name ": [FAILED]\n" ); \ + break; \ + } \ + } \ + } + + ok = 1; used_asm = 0; + TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 ); + TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 ); + TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 ); + report( "coeff_level_run :" ); + return ret; } @@ -1338,6 +1393,11 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( x264_cpu_detect() & X264_CPU_SSE2 ) { @@ -1351,6 +1411,12 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" ); cpu1 &= ~X264_CPU_SSE_MISALIGN; } + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + cpu1 &= ~X264_CPU_CACHELINE_64; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } if( x264_cpu_detect() & X264_CPU_SSE3 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); if( x264_cpu_detect() & X264_CPU_SSSE3 ) | ||
Changed | x264-snapshot-20090119-2245.tar.bz2/x264.c
@@ -220,7 +220,9 @@ " where <option> is either\n" " q=<integer> (force QP)\n" " or b=<float> (bitrate multiplier)\n" ); - H1( " --qpfile <string> Force frametypes and QPs\n" ); + H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n" + " Format of each line: framenumber frametype QP\n" + " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" ); H0( "\n" ); H0( "Analysis:\n" ); H0( "\n" ); @@ -563,8 +565,6 @@ fprintf( stderr, "x264 [error]: can't open `%s'\n", optarg ); return -1; } - param->i_scenecut_threshold = -1; - param->i_bframe_adaptive = X264_B_ADAPT_NONE; break; case OPT_THREAD_INPUT: b_thread_input = 1; | ||
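The expanded --qpfile help now documents the per-line format. A hypothetical qpfile following that description, using the listed frametypes (I,i,P,B,b) and -1 where x264 should pick the QP itself:

    0 I 22
    48 P -1
    96 i -1
    150 B 30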
Changed | x264-snapshot-20090119-2245.tar.bz2/x264.h
@@ -62,6 +62,7 @@ #define X264_CPU_SSE4 0x002000 /* SSE4.1 */ #define X264_CPU_SSE42 0x004000 /* SSE4.2 */ #define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */ /* Analyse flags */ @@ -341,7 +342,11 @@ typedef struct { - /* In: force picture type (if not auto) XXX: ignored for now + /* In: force picture type (if not auto) + * If x264 encoding parameters are violated in the forcing of picture types, + * x264 will correct the input picture type and log a warning. + * The quality of frametype decisions may suffer if a great deal of fine-grained + * mixing of auto and forced frametypes is done. * Out: type of the picture encoded */ int i_type; /* In: force quantizer for > 0 */ | ||
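X264_CPU_LZCNT joins the CPU-capability bitmask, and the checkasm.c hunk above shows the intended usage: test the bit returned by x264_cpu_detect() before exercising an LZCNT-specific code path. A caller-side sketch; the dispatch function and the assumption that x264_cpu_detect() returns a 32-bit flag word are illustrative, while the common/cpu.h include path is the one this patch itself adds to analyse.c.

    #include <stdint.h>
    #include "x264.h"          /* X264_CPU_LZCNT */
    #include "common/cpu.h"    /* x264_cpu_detect() -- internal header */

    /* Choose between a generic helper and an LZCNT-accelerated one
     * (both function pointers are hypothetical stand-ins). */
    typedef int (*coeff_last_fn)( const int16_t *coeffs );

    static coeff_last_fn select_coeff_last( coeff_last_fn generic, coeff_last_fn lzcnt )
    {
        uint32_t cpu = x264_cpu_detect();
        return ( cpu & X264_CPU_LZCNT ) ? lzcnt : generic;
    }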
Changed | x264-snapshot-20090228-2245.tar.bz2
Changed | x264-snapshot-20090627-2245.tar.bz2