Changed: x264.spec
Deleted: x264-snapshot-20081218-2245.tar.bz2/.git/objects/pack/pack-af7017097b709ffa675014eec71010e10908193f.idx
Deleted: x264-snapshot-20081218-2245.tar.bz2/.git/objects/pack/pack-af7017097b709ffa675014eec71010e10908193f.pack
Changed: x264-snapshot-20090119-2245.tar.bz2/.git/index
Added: x264-snapshot-20090119-2245.tar.bz2/.git/objects/pack/pack-7e284d41e3d870f5e6bd5c2ad5b36d1e4f0910d9.idx
Added: x264-snapshot-20090119-2245.tar.bz2/.git/objects/pack/pack-7e284d41e3d870f5e6bd5c2ad5b36d1e4f0910d9.pack

Changed: x264-snapshot-20090119-2245.tar.bz2/.git/refs/heads/master
@@ -1 +1 @@
-5f8a1490eb0bc2a934c34bc8307bfdc1ade6a92d
+a48d1d0a2ad590d041b79bb152ed47d00451ba8d

Changed: x264-snapshot-20090119-2245.tar.bz2/.git/refs/heads/origin
@@ -1 +1 @@
-5f8a1490eb0bc2a934c34bc8307bfdc1ade6a92d
+a48d1d0a2ad590d041b79bb152ed47d00451ba8d

Changed: x264-snapshot-20090119-2245.tar.bz2/common/bs.h
@@ -50,10 +50,18 @@
int i_bits_encoded; /* RD only */
} bs_t;
-extern const vlc_t x264_coeff_token[5][17*4];
+typedef struct
+{
+ int last;
+ int16_t level[16];
+ uint8_t run[16];
+} x264_run_level_t;
+
+extern const vlc_t x264_coeff0_token[5];
+extern const vlc_t x264_coeff_token[5][16*4];
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_dc[3][4];
-extern const vlc_t x264_run_before[7][15];
+extern const vlc_t x264_run_before[7][16];
/* A larger level table size theoretically could help a bit at extremely
* high bitrates, but the cost in cache is usually too high for it to be

Changed: x264-snapshot-20090119-2245.tar.bz2/common/cabac.c
@@ -742,41 +742,6 @@
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
-static const uint8_t x264_cabac_probability[128] =
-{
- FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
- FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
- FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
- FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
- FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
- FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
- FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
- FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
- FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
- FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
- FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
- FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
- FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
- FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
- FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
- FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
- FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
- FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
- FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
- FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
- FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
- FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
- FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
- FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
- FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
- FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
- FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
- FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
- FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
- FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
- FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
- FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
-};
/* -ln2(probability) */
#define F(a,b) {FIX8(a),FIX8(b)}
const uint16_t x264_cabac_entropy[128][2] =

Changed: x264-snapshot-20090119-2245.tar.bz2/common/common.c
@@ -247,7 +247,7 @@
if( b_error )
{
char *buf = strdup(value);
- char *tok, *saveptr, *init;
+ char *tok, UNUSED *saveptr, *init;
b_error = 0;
p->cpu = 0;
for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )

Changed: x264-snapshot-20090119-2245.tar.bz2/common/common.h
@@ -338,6 +338,7 @@
int i_max_ref1;
int i_delay; /* Number of frames buffered for B reordering */
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
+ int b_have_sub8x8_esa;
} frames;
/* current frame being encoded */
@@ -604,6 +605,8 @@
} stat;
+ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
x264_predict_t predict_8x8c[4+3];

Changed: x264-snapshot-20090119-2245.tar.bz2/common/cpu.c
@@ -33,6 +33,11 @@
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
+#ifdef SYS_OPENBSD
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
#include "common.h"
#include "cpu.h"
@@ -54,6 +59,7 @@
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
+ {"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"", 0},
};
@@ -117,6 +123,7 @@
{
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
+ cpu |= X264_CPU_LZCNT;
x264_cpu_mask_misalign_sse();
}
else
@@ -192,13 +199,17 @@
#elif defined( ARCH_PPC )
-#ifdef SYS_MACOSX
+#if defined(SYS_MACOSX) || defined(SYS_OPENBSD)
#include <sys/sysctl.h>
uint32_t x264_cpu_detect( void )
{
/* Thank you VLC */
uint32_t cpu = 0;
+#ifdef SYS_OPENBSD
+ int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+#else
int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+#endif
int has_altivec = 0;
size_t length = sizeof( has_altivec );
int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
@@ -292,10 +303,15 @@
get_system_info( &info );
return info.cpu_count;
-#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD)
+#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD) || defined(SYS_OPENBSD)
int numberOfCPUs;
size_t length = sizeof( numberOfCPUs );
+#ifdef SYS_OPENBSD
+ int mib[2] = { CTL_HW, HW_NCPU };
+ if( sysctl(mib, 2, &numberOfCPUs, &length, NULL, 0) )
+#else
if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )
+#endif
{
numberOfCPUs = 1;
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/cpu.h
@@ -35,7 +35,8 @@
* This applies only to x86_32, since other architectures that need alignment
* also have ABIs that ensure aligned stack. */
#if defined(ARCH_X86) && defined(HAVE_MMX)
-void x264_stack_align( void (*func)(x264_t*), x264_t *arg );
+int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
+#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
#else
#define x264_stack_align(func,arg) func(arg)
#endif

Changed: x264-snapshot-20090119-2245.tar.bz2/common/frame.c
@@ -99,7 +99,7 @@
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[3],
- 2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+ frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/macroblock.c
@@ -23,6 +23,7 @@
*****************************************************************************/
#include "common.h"
+#include "encoder/me.h"
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
@@ -140,8 +141,8 @@
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
if( i_refa == -2 || i_refb == -2 ||
- ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
- ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
+ !( i_refa | *(uint32_t*)mv_a ) ||
+ !( i_refb | *(uint32_t*)mv_b ) )
{
*(uint32_t*)mv = 0;
}
@@ -730,17 +731,9 @@
}
else /* B_*x* */
{
- int b_list0[2];
- int b_list1[2];
+ const uint8_t *b_list0 = x264_mb_type_list_table[h->mb.i_type][0];
+ const uint8_t *b_list1 = x264_mb_type_list_table[h->mb.i_type][1];
- int i;
-
- /* init ref list utilisations */
- for( i = 0; i < 2; i++ )
- {
- b_list0[i] = x264_mb_type_list0_table[h->mb.i_type][i];
- b_list1[i] = x264_mb_type_list1_table[h->mb.i_type][i];
- }
if( h->mb.i_partition == D_16x16 )
{
if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
@@ -846,6 +839,13 @@
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+ int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+ int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+ int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+ int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+ ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+ CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
+
return 0;
fail: return -1;
}
@@ -871,6 +871,7 @@
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
+ x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{

Changed: x264-snapshot-20090119-2245.tar.bz2/common/macroblock.h
@@ -91,31 +91,18 @@
B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
};
-static const uint8_t x264_mb_type_list0_table[X264_MBTYPE_MAX][2] =
+static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
{
- {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */
- {1,1}, /* P_L0 */
- {0,0}, /* P_8x8 */
- {1,1}, /* P_SKIP */
- {0,0}, /* B_DIRECT */
- {1,1}, {1,0}, {1,1}, /* B_L0_* */
- {0,1}, {0,0}, {0,1}, /* B_L1_* */
- {1,1}, {1,0}, {1,1}, /* B_BI_* */
- {0,0}, /* B_8x8 */
- {0,0} /* B_SKIP */
-};
-static const uint8_t x264_mb_type_list1_table[X264_MBTYPE_MAX][2] =
-{
- {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */
- {0,0}, /* P_L0 */
- {0,0}, /* P_8x8 */
- {0,0}, /* P_SKIP */
- {0,0}, /* B_DIRECT */
- {0,0}, {0,1}, {0,1}, /* B_L0_* */
- {1,0}, {1,1}, {1,1}, /* B_L1_* */
- {1,0}, {1,1}, {1,1}, /* B_BI_* */
- {0,0}, /* B_8x8 */
- {0,0} /* B_SKIP */
+ {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */
+ {{1,1},{0,0}}, /* P_L0 */
+ {{0,0},{0,0}}, /* P_8x8 */
+ {{1,1},{0,0}}, /* P_SKIP */
+ {{0,0},{0,0}}, /* B_DIRECT */
+ {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}}, /* B_L0_* */
+ {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}}, /* B_L1_* */
+ {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}}, /* B_BI_* */
+ {{0,0},{0,0}}, /* B_8x8 */
+ {{0,0},{0,0}} /* B_SKIP */
};
#define IS_SUB4x4(type) ( (type ==D_L0_4x4)||(type ==D_L1_4x4)||(type ==D_BI_4x4))

Changed: x264-snapshot-20090119-2245.tar.bz2/common/mc.c
@@ -132,9 +132,8 @@
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int stride, int width, int height )
+ int stride, int width, int height, int16_t *buf )
{
- int16_t *buf = x264_malloc((width+5)*sizeof(int16_t));
int x, y;
for( y=0; y<height; y++ )
{
@@ -153,7 +152,6 @@
dstc += stride;
src += stride;
}
- x264_free(buf);
}
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
@@ -269,6 +267,42 @@
memset( dst, 0, n );
}
+static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+{
+ int x, v = pix[0]+pix[1]+pix[2]+pix[3];
+ for( x=0; x<stride-4; x++ )
+ {
+ sum[x] = v + sum[x-stride];
+ v += pix[x+4] - pix[x];
+ }
+}
+
+static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
+{
+ int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
+ for( x=0; x<stride-8; x++ )
+ {
+ sum[x] = v + sum[x-stride];
+ v += pix[x+8] - pix[x];
+ }
+}
+
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+{
+ int x;
+ for( x=0; x<stride-8; x++ )
+ sum4[x] = sum8[x+4*stride] - sum8[x];
+ for( x=0; x<stride-8; x++ )
+ sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
+}
+
+static void integral_init8v( uint16_t *sum8, int stride )
+{
+ int x;
+ for( x=0; x<stride-8; x++ )
+ sum8[x] = sum8[x+8*stride] - sum8[x];
+}
+
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
uint8_t *src = frame->plane[0];
@@ -353,6 +387,11 @@
pf->memzero_aligned = memzero_aligned;
pf->frame_init_lowres_core = frame_init_lowres_core;
+ pf->integral_init4h = integral_init4h;
+ pf->integral_init8h = integral_init8h;
+ pf->integral_init4v = integral_init4v;
+ pf->integral_init8v = integral_init8v;
+
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
@@ -370,7 +409,7 @@
int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
- int x, y;
+ int y;
if( mb_y & b_interlaced )
return;
@@ -382,7 +421,8 @@
frame->filtered[2] + offs,
frame->filtered[3] + offs,
frame->plane[0] + offs,
- stride, width + 16, height - start );
+ stride, width + 16, height - start,
+ h->scratch_buffer );
}
/* generate integral image:
@@ -398,23 +438,25 @@
start = -PADV;
}
if( b_end )
- height += PADV-8;
+ height += PADV-9;
for( y = start; y < height; y++ )
{
- uint8_t *ref = frame->plane[0] + y * stride - PADH;
- uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
- uint16_t v = line[0] = 0;
- for( x = 1; x < stride-1; x++ )
- line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
- line -= 8*stride;
- if( y >= 9-PADV )
+ uint8_t *pix = frame->plane[0] + y * stride - PADH;
+ uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+ uint16_t *sum4;
+ if( h->frames.b_have_sub8x8_esa )
+ {
+ h->mc.integral_init4h( sum8, pix, stride );
+ sum8 -= 8*stride;
+ sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
+ if( y >= 8-PADV )
+ h->mc.integral_init4v( sum8, sum4, stride );
+ }
+ else
{
- uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
- for( x = 1; x < stride-8; x++, line++, sum4++ )
- {
- sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
- line[0] += line[8+8*stride] - line[8] - line[8*stride];
- }
+ h->mc.integral_init8h( sum8, pix, stride );
+ if( y >= 8-PADV )
+ h->mc.integral_init8v( sum8-8*stride, stride );
}
}
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/mc.h
@@ -55,7 +55,7 @@
uint8_t *src, int i_src, int w, int h);
void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int i_stride, int i_width, int i_height );
+ int i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
@@ -66,6 +66,12 @@
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
+ /* successive elimination prefilter */
+ void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+ void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+ void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+ void (*integral_init8v)( uint16_t *sum8, int stride );
+
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
} x264_mc_functions_t;

Changed: x264-snapshot-20090119-2245.tar.bz2/common/osdep.h
@@ -47,6 +47,7 @@
#define fseek _fseeki64
#define ftell _ftelli64
#define isfinite _finite
+#define strtok_r strtok_s
#define _CRT_SECURE_NO_DEPRECATE
#define X264_VERSION "" // no configure script for msvc
#endif
@@ -169,7 +170,7 @@
}
#endif
-#ifdef __GNUC__
+#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
#define x264_clz(x) __builtin_clz(x)
#else
static int ALWAYS_INLINE x264_clz( uint32_t x )

Changed: x264-snapshot-20090119-2245.tar.bz2/common/pixel.c
@@ -140,7 +140,7 @@
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
int x, y; \
@@ -154,7 +154,6 @@
pix += i_stride; \
} \
var = sqr - (sum * sum >> shift); \
- *sad = sum; \
return var; \
}
@@ -489,12 +488,12 @@
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
uint8_t *pix1, int stride1,
uint8_t *pix2, int stride2,
- int width, int height )
+ int width, int height, void *buf )
{
int x, y, z;
float ssim = 0.0;
- int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
- int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
+ int (*sum0)[4] = buf;
+ int (*sum1)[4] = sum0 + width/4+3;
width >>= 2;
height >>= 2;
z = 0;
@@ -509,8 +508,6 @@
for( x = 0; x < width-1; x += 4 )
ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
}
- x264_free(sum0);
- x264_free(sum1);
return ssim;
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/pixel.h
@@ -74,7 +74,7 @@
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
- int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+ int (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
@@ -104,6 +104,6 @@
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
#endif

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/dct.c
@@ -21,10 +21,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "common/common.h"
#include "ppccommon.h"

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/deblock.c
@@ -18,10 +18,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#if defined SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "common/common.h"
#include "ppccommon.h"

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/mc.c
@@ -27,10 +27,6 @@
#include <stdint.h>
#include <stdarg.h>
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "x264.h"
#include "common/common.h"
#include "common/mc.h"
@@ -545,7 +541,7 @@
}
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int i_stride, int i_width, int i_height )
+ int i_stride, int i_width, int i_height, int16_t *buf )
{
int x, y;
@@ -563,7 +559,7 @@
vec_u16_t twov, fourv, fivev, sixv;
vec_s16_t sixteenv, thirtytwov;
- vect_ushort_u temp_u;
+ vec_u16_u temp_u;
temp_u.s[0]=2;
twov = vec_splat( temp_u.v, 0 );

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/pixel.c
@@ -21,10 +21,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "common/common.h"
#include "ppccommon.h"

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/ppccommon.h
@@ -18,6 +18,10 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
/***********************************************************************
* For constant vectors, use parentheses on OS X and braces on Linux
**********************************************************************/
@@ -38,19 +42,24 @@
#define vec_s32_t vector signed int
typedef union {
- unsigned int s[4];
- vector unsigned int v;
-} vect_int_u;
+ uint32_t s[4];
+ vec_u32_t v;
+} vec_u32_u;
+
+typedef union {
+ uint16_t s[8];
+ vec_u16_t v;
+} vec_u16_u;
typedef union {
- unsigned short s[8];
- vector unsigned short v;
-} vect_ushort_u;
+ int16_t s[8];
+ vec_s16_t v;
+} vec_s16_u;
typedef union {
- signed short s[8];
- vector signed short v;
-} vect_sshort_u;
+ uint8_t s[16];
+ vec_u8_t v;
+} vec_u8_u;
/***********************************************************************
* Null vector

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/predict.c
@@ -1,7 +1,7 @@
/*****************************************************************************
* predict.c: h264 encoder
*****************************************************************************
- * Copyright (C) 2007-2008 Guillaume Poirier <gpoirier@mplayerhq.hu>
+ * Copyright (C) 2007-2009 Guillaume Poirier <gpoirier@mplayerhq.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,15 +18,65 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#ifdef SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
#include "ppccommon.h"
+static void predict_8x8c_p_altivec( uint8_t *src )
+{
+ int i;
+ int a, b, c;
+ int H = 0;
+ int V = 0;
+ int i00;
+
+ for( i = 0; i < 4; i++ )
+ {
+ H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
+ V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
+ }
+
+ a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
+ b = ( 17 * H + 16 ) >> 5;
+ c = ( 17 * V + 16 ) >> 5;
+ i00 = a -3*b -3*c + 16;
+
+ vec_s16_u i00_u, b_u, c_u;
+ i00_u.s[0] = i00;
+ b_u.s[0] = b;
+ c_u.s[0] = c;
+
+ vec_u16_t val5_v = vec_splat_u16(5);
+ vec_s16_t i00_v, b_v, c_v;
+ i00_v = vec_splat(i00_u.v, 0);
+ b_v = vec_splat(b_u.v, 0);
+ c_v = vec_splat(c_u.v, 0);
+
+ vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
+ vec_s32_t mule_b_v = vec_mule(induc_v, b_v);
+ vec_s32_t mulo_b_v = vec_mulo(induc_v, b_v);
+ vec_s16_t mul_b_induc0_v = vec_pack(vec_mergeh(mule_b_v, mulo_b_v), vec_mergel(mule_b_v, mulo_b_v));
+ vec_s16_t add_i0_b_0v = vec_adds(i00_v, mul_b_induc0_v);
+
+ PREP_STORE8;
+
+ for( i = 0; i < 8; ++i )
+ {
+ vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
+ vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v);
+ VEC_STORE8(com_sat_v, &src[0]);
+ src += FDEC_STRIDE;
+ add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
+
+ }
+}
+
+
+/****************************************************************************
+ * 16x16 prediction for intra luma block
+ ****************************************************************************/
+
static void predict_16x16_p_altivec( uint8_t *src )
{
int16_t a, b, c, i;
@@ -45,7 +95,7 @@
c = ( 5 * V + 32 ) >> 6;
i00 = a - b * 7 - c * 7 + 16;
- vect_sshort_u i00_u, b_u, c_u;
+ vec_s16_u i00_u, b_u, c_u;
i00_u.s[0] = i00;
b_u.s[0] = b;
c_u.s[0] = c;
@@ -72,16 +122,122 @@
vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
vec_st( com_sat_v, 0, &src[0]);
src += FDEC_STRIDE;
- i00 += c;
add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
}
}
+#define PREDICT_16x16_DC_ALTIVEC(v) \
+for (i=0; i<16; i+=2) \
+{ \
+ vec_st(v, 0, src); \
+ vec_st(v, FDEC_STRIDE, src); \
+ src += FDEC_STRIDE*2; \
+}
+
+static void predict_16x16_dc_altivec( uint8_t *src )
+{
+ uint32_t dc = 0;
+ int i;
+
+ for( i = 0; i < 16; i++ )
+ {
+ dc += src[-1 + i * FDEC_STRIDE];
+ dc += src[i - FDEC_STRIDE];
+ }
+ vec_u8_u v ; v.s[0] = (( dc + 16 ) >> 5);
+ vec_u8_t bc_v = vec_splat(v.v, 0);
+
+ PREDICT_16x16_DC_ALTIVEC(bc_v);
+}
+
+static void predict_16x16_dc_left_altivec( uint8_t *src )
+{
+ uint32_t dc = 0;
+ int i;
+
+ for( i = 0; i < 16; i++ )
+ {
+ dc += src[-1 + i * FDEC_STRIDE];
+ }
+ vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4);
+ vec_u8_t bc_v = vec_splat(v.v, 0);
+
+ PREDICT_16x16_DC_ALTIVEC(bc_v);
+}
+
+static void predict_16x16_dc_top_altivec( uint8_t *src )
+{
+ uint32_t dc = 0;
+ int i;
+
+ for( i = 0; i < 16; i++ )
+ {
+ dc += src[i - FDEC_STRIDE];
+ }
+ vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4);
+ vec_u8_t bc_v = vec_splat(v.v, 0);
+
+ PREDICT_16x16_DC_ALTIVEC(bc_v);
+}
+
+static void predict_16x16_dc_128_altivec( uint8_t *src )
+{
+ int i;
+ /* test if generating the constant is faster than loading it.
+ vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080, 0x80808080, 0x80808080);
+ */
+ vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1),(vec_u8_t)vec_splat_u8(7));
+ PREDICT_16x16_DC_ALTIVEC(bc_v);
+}
+
+static void predict_16x16_h_altivec( uint8_t *src )
+{
+ int i;
+
+ for( i = 0; i < 16; i++ )
+ {
+ vec_u8_t v = vec_ld(-1, src);
+ vec_u8_t v_v = vec_splat(v, 15);
+ vec_st(v_v, 0, src);
+
+ src += FDEC_STRIDE;
+ }
+}
+
+static void predict_16x16_v_altivec( uint8_t *src )
+{
+ vec_u32_u v;
+ v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE];
+ v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE];
+ v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE];
+ v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE];
+
+ int i;
+
+ for( i = 0; i < 16; i++ )
+ {
+ vec_st(v.v, 0, (uint32_t*)src);
+ src += FDEC_STRIDE;
+ }
+}
+
+
/****************************************************************************
* Exported functions:
****************************************************************************/
void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
{
- pf[I_PRED_16x16_P] = predict_16x16_p_altivec;
+ pf[I_PRED_16x16_V ] = predict_16x16_v_altivec;
+ pf[I_PRED_16x16_H ] = predict_16x16_h_altivec;
+ pf[I_PRED_16x16_DC] = predict_16x16_dc_altivec;
+ pf[I_PRED_16x16_P ] = predict_16x16_p_altivec;
+ pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec;
+ pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec;
+ pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec;
+}
+
+void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] )
+{
+ pf[I_PRED_CHROMA_P] = predict_8x8c_p_altivec;
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/predict.h
@@ -22,5 +22,6 @@
#define X264_PPC_PREDICT_H
void x264_predict_16x16_init_altivec ( x264_predict_t pf[7] );
+void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );
#endif /* X264_PPC_PREDICT_H */

Changed: x264-snapshot-20090119-2245.tar.bz2/common/ppc/quant.c
@@ -18,10 +18,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
-#if defined SYS_LINUX
-#include <altivec.h>
-#endif
-
#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"
@@ -75,7 +71,7 @@
vec_s16_t temp1v, temp2v;
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
@@ -129,15 +125,15 @@
vec_u16_t mfv;
vec_u16_t biasv;
- vect_ushort_u mf_u;
+ vec_u16_u mf_u;
mf_u.s[0]=mf;
mfv = vec_splat( mf_u.v, 0 );
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
- vect_ushort_u bias_u;
+ vec_u16_u bias_u;
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
@@ -177,15 +173,15 @@
vec_u16_t mfv;
vec_u16_t biasv;
- vect_ushort_u mf_u;
+ vec_u16_u mf_u;
mf_u.s[0]=mf;
mfv = vec_splat( mf_u.v, 0 );
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
- vect_ushort_u bias_u;
+ vec_u16_u bias_u;
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
@@ -213,7 +209,7 @@
vec_s16_t temp1v, temp2v;
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
@@ -282,7 +278,7 @@
if( i_qbits >= 0 )
{
vec_u16_t i_qbitsv;
- vect_ushort_u qbits_u;
+ vec_u16_u qbits_u;
qbits_u.s[0]=i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
@@ -294,17 +290,17 @@
const int f = 1 << (-i_qbits-1);
vec_s32_t fv;
- vect_int_u f_u;
+ vec_u32_u f_u;
f_u.s[0]=f;
fv = (vec_s32_t)vec_splat(f_u.v, 0);
vec_u32_t i_qbitsv;
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=-i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
vec_u32_t sixteenv;
- vect_int_u sixteen_u;
+ vec_u32_u sixteen_u;
sixteen_u.s[0]=16;
sixteenv = vec_splat(sixteen_u.v, 0);
@@ -329,7 +325,7 @@
if( i_qbits >= 0 )
{
vec_u16_t i_qbitsv;
- vect_ushort_u qbits_u;
+ vec_u16_u qbits_u;
qbits_u.s[0]=i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
@@ -341,17 +337,17 @@
const int f = 1 << (-i_qbits-1);
vec_s32_t fv;
- vect_int_u f_u;
+ vec_u32_u f_u;
f_u.s[0]=f;
fv = (vec_s32_t)vec_splat(f_u.v, 0);
vec_u32_t i_qbitsv;
- vect_int_u qbits_u;
+ vec_u32_u qbits_u;
qbits_u.s[0]=-i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
vec_u32_t sixteenv;
- vect_int_u sixteen_u;
+ vec_u32_u sixteen_u;
sixteen_u.s[0]=16;
sixteenv = vec_splat(sixteen_u.v, 0);

Changed: x264-snapshot-20090119-2245.tar.bz2/common/predict.c
@@ -786,6 +786,13 @@
#ifdef HAVE_MMX
x264_predict_8x8c_init_mmx( cpu, pf );
#endif
+
+#ifdef ARCH_PPC
+ if( cpu&X264_CPU_ALTIVEC )
+ {
+ x264_predict_8x8c_init_altivec( pf );
+ }
+#endif
}
void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )

Changed: x264-snapshot-20090119-2245.tar.bz2/common/quant.c
@@ -273,6 +273,27 @@
return x264_coeff_last_internal( l, 64 );
}
+#define level_run(num)\
+static int x264_coeff_level_run##num( int16_t *dct, x264_run_level_t *runlevel )\
+{\
+ int i_last = runlevel->last = x264_coeff_last##num(dct);\
+ int i_total = 0;\
+ do\
+ {\
+ int r = 0;\
+ runlevel->level[i_total] = dct[i_last];\
+ while( --i_last >= 0 && dct[i_last] == 0 )\
+ r++;\
+ runlevel->run[i_total++] = r;\
+ } while( i_last >= 0 );\
+ return i_total;\
+}
+
+level_run(4)
+level_run(15)
+level_run(16)
+
+
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
@@ -293,6 +314,9 @@
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
@@ -323,8 +347,16 @@
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+ }
}
if( cpu&X264_CPU_SSE2 )
@@ -347,6 +379,16 @@
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
+ }
}
if( cpu&X264_CPU_SSSE3 )
@@ -375,4 +417,6 @@
#endif
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
+ pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
+ pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC];
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/quant.h
@@ -40,6 +40,7 @@
int (*decimate_score16)( int16_t *dct );
int (*decimate_score64)( int16_t *dct );
int (*coeff_last[6])( int16_t *dct );
+ int (*coeff_level_run[5])( int16_t *dct, x264_run_level_t *runlevel );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );

Changed: x264-snapshot-20090119-2245.tar.bz2/common/vlc.c
@@ -21,15 +21,19 @@
#include "common.h"
#define MKVLC( a, b ) { a, b }
-const vlc_t x264_coeff_token[5][17*4] =
+const vlc_t x264_coeff0_token[5] =
+{
+ MKVLC( 0x1, 1 ), /* str=1 */
+ MKVLC( 0x3, 2 ), /* str=11 */
+ MKVLC( 0xf, 4 ), /* str=1111 */
+ MKVLC( 0x3, 6 ), /* str=000011 */
+ MKVLC( 0x1, 2 ) /* str=01 */
+};
+
+const vlc_t x264_coeff_token[5][16*4] =
{
/* table 0 */
{
- MKVLC( 0x1, 1 ), /* str=1 */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
-
MKVLC( 0x5, 6 ), /* str=000101 */
MKVLC( 0x1, 2 ), /* str=01 */
MKVLC( 0x0, 0 ), /* str= */
@@ -113,11 +117,6 @@
/* table 1 */
{
- MKVLC( 0x3, 2 ), /* str=11 */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
-
MKVLC( 0xb, 6 ), /* str=001011 */
MKVLC( 0x2, 2 ), /* str=10 */
MKVLC( 0x0, 0 ), /* str= */
@@ -200,11 +199,6 @@
},
/* table 2 */
{
- MKVLC( 0xf, 4 ), /* str=1111 */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
-
MKVLC( 0xf, 6 ), /* str=001111 */
MKVLC( 0xe, 4 ), /* str=1110 */
MKVLC( 0x0, 0 ), /* str= */
@@ -288,11 +282,6 @@
/* table 3 */
{
- MKVLC( 0x3, 6 ), /* str=000011 */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
-
MKVLC( 0x0, 6 ), /* str=000000 */
MKVLC( 0x1, 6 ), /* str=000001 */
MKVLC( 0x0, 0 ), /* str= */
@@ -376,11 +365,6 @@
/* table 4 */
{
- MKVLC( 0x1, 2 ), /* str=01 */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
- MKVLC( 0x0, 0 ), /* str= */
-
MKVLC( 0x7, 6 ), /* str=000111 */
MKVLC( 0x1, 1 ), /* str=1 */
MKVLC( 0x0, 0 ), /* str= */
@@ -762,7 +746,7 @@
};
/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */
-const vlc_t x264_run_before[7][15] =
+const vlc_t x264_run_before[7][16] =
{
{ /* i_zero_left 1 */
MKVLC( 0x1, 1 ), /* str=1 */

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/cabac-a.asm
@@ -31,21 +31,12 @@
cextern x264_cabac_transition
cextern x264_cabac_renorm_shift
-%macro DEF_TMP 16
- %rep 8
- %define t%1d r%9d
- %define t%1b r%9b
- %define t%1 r%9
- %rotate 1
- %endrep
-%endmacro
-
; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
- DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
- DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
+ DECLARE_REG_TMP 0,3,2,1,4,5,6,3
%define pointer resd
%endif

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-a.asm
@@ -41,27 +41,13 @@
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
- %define t0 r0
- %define t1 r1
- %define t2 r2
- %define t3 r3
- %define t4 r4
- %define t5 r5
- %define t6d r10d
- %define t7d r11d
+ DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0
PROLOGUE 6,7
.height_loop:
%endmacro
%else
- %define t0 r1
- %define t1 r2
- %define t2 r3
- %define t3 r4
- %define t4 r5
- %define t5 r6
- %define t6d r1d
- %define t7d r2d
+ DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0
PROLOGUE 0,7
mov t0, r0m
@@ -690,12 +676,11 @@
; chroma MC
;=============================================================================
- %define t0d eax
- %define t0 rax
+ %define t0 rax
%ifdef ARCH_X86_64
- %define t1d r10d
+ %define t1 r10
%else
- %define t1d r1d
+ %define t1 r1
%endif
%macro MC_CHROMA_START 0

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-a2.asm
@@ -694,6 +694,104 @@
+;-----------------------------------------------------------------------------
+; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4h_sse4, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
+.loop:
+ movdqa m0, [r1+r2]
+ movdqu m1, [r1+r2+8]
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+cglobal x264_integral_init8h_sse4, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
+.loop:
+ movdqa m0, [r1+r2]
+ movdqu m1, [r1+r2+8]
+ movdqa m2, m0
+ movdqa m3, m1
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ mpsadbw m2, m4, 4
+ mpsadbw m3, m4, 4
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ paddw m0, m2
+ paddw m1, m3
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+%macro INTEGRAL_INIT 1
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4v_%1, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ movu m0, [r0+r2+8]
+ mova m2, [r0+r2]
+ movu m1, [r4+r2+8]
+ paddw m0, m2
+ paddw m1, [r4+r2]
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, mmsize
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init8v_%1, 3,3
+ shl r1, 1
+ add r0, r1
+ lea r2, [r0+r1*8]
+ neg r1
+.loop:
+ mova m0, [r2+r1]
+ mova m1, [r2+r1+mmsize]
+ psubw m0, [r0+r1]
+ psubw m1, [r0+r1+mmsize]
+ mova [r0+r1], m0
+ mova [r0+r1+mmsize], m1
+ add r1, 2*mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+INTEGRAL_INIT mmx
+INIT_XMM
+INTEGRAL_INIT sse2
+
+
+
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/mc-c.c
@@ -64,6 +64,12 @@
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n );
extern void x264_memzero_aligned_sse2( void * dst, int n );
+extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
@@ -196,16 +202,14 @@
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
void x264_sfence( void );\
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
- int stride, int width, int height )\
+ int stride, int width, int height, int16_t *buf )\
{\
- int16_t *buf;\
int realign = (long)src & (align-1);\
src -= realign;\
dstv -= realign;\
dstc -= realign;\
dsth -= realign;\
width += realign;\
- buf = x264_malloc((width+16)*sizeof(int16_t));\
while( height-- )\
{\
x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
@@ -217,14 +221,13 @@
src += stride;\
}\
x264_sfence();\
- x264_free(buf);\
}
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
#ifdef ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, sse2, ssse3, ssse3)
@@ -242,6 +245,8 @@
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
+ pf->integral_init4v = x264_integral_init4v_mmx;
+ pf->integral_init8v = x264_integral_init8v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
@@ -286,6 +291,8 @@
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
pf->memzero_aligned = x264_memzero_aligned_sse2;
+ pf->integral_init4v = x264_integral_init4v_sse2;
+ pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
if( cpu&X264_CPU_SSE2_IS_SLOW )
@@ -331,4 +338,10 @@
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+
+ if( !(cpu&X264_CPU_SSE4) )
+ return;
+
+ pf->integral_init4h = x264_integral_init4h_sse4;
+ pf->integral_init8h = x264_integral_init8h_sse4;
}

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/pixel-a.asm
@@ -230,20 +230,15 @@
pxor m6, m6 ; sum squared
pxor m7, m7 ; zero
%ifdef ARCH_X86_64
- %define t3d r3d
+ %define t3 r3
%else
- %define t3d r2d
+ %define t3 r2
%endif
%endmacro
%macro VAR_END 1
-%if mmsize == 16
- movhlps m0, m5
- paddw m5, m0
-%endif
- movifnidn r2d, r2m
+ HADDW m5, m7
movd r1d, m5
- movd [r2], m5 ; return sum
imul r1d, r1d
HADDD m6, m1
shr r1d, %1
@@ -258,27 +253,25 @@
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
- mova m2, m0
- punpcklbw m0, m7
mova m4, m3
+ punpcklbw m0, m7
punpckhbw m1, m7
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
- punpckhbw m4, m7
- psadbw m2, m7
- paddw m5, m2
- mova m2, m3
punpcklbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m0
dec t3d
- psadbw m2, m7
pmaddwd m0, m0
- paddw m5, m2
+ paddw m5, m1
pmaddwd m1, m1
+ paddw m5, m3
paddd m6, m0
pmaddwd m3, m3
+ paddw m5, m4
paddd m6, m1
pmaddwd m4, m4
paddd m6, m3
@@ -287,7 +280,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
@@ -315,13 +308,12 @@
lea r0, [r0+r1*2]
mova m1, m0
punpcklbw m0, m7
- mova m2, m1
punpckhbw m1, m7
dec t3d
+ paddw m5, m0
+ paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
- psadbw m2, m7
- paddw m5, m2
paddd m6, m0
paddd m6, m1
jnz .loop
@@ -1036,15 +1028,13 @@
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
- %define t0 r10
- %define t0d r10d
+ %define t0 r10
%else
; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
SUB esp, 16
%define top_1d esp+8
%define left_1d esp
- %define t0 r2
- %define t0d r2d
+ %define t0 r2
%endif
call load_hadamard
@@ -1076,17 +1066,11 @@
RET
%ifdef ARCH_X86_64
- %define t0 r10
- %define t0d r10d
- %define t2 r11
- %define t2w r11w
- %define t2d r11d
+ %define t0 r10
+ %define t2 r11
%else
- %define t0 r0
- %define t0d r0d
- %define t2 r2
- %define t2w r2w
- %define t2d r2d
+ %define t0 r0
+ %define t2 r2
%endif
;-----------------------------------------------------------------------------
@@ -1739,10 +1723,10 @@
%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
- %define t0 r6
+ %define t0 r6
mov r10, rsp
%else
- %define t0 r4
+ %define t0 r4
mov rbp, rsp
%endif
mov r0d, r5m

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/pixel.h
@@ -67,8 +67,8 @@
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/quant-a.asm
@@ -241,19 +241,9 @@
%endmacro
%ifdef ARCH_X86_64
- %define t0 r4
- %define t0d r4d
- %define t1 r3
- %define t1d r3d
- %define t2 r2
- %define t2d r2d
-%else
- %define t0 r2
- %define t0d r2d
- %define t1 r0
- %define t1d r0d
- %define t2 r1
- %define t2d r1d
+ DECLARE_REG_TMP 4,3,2
+%else
+ DECLARE_REG_TMP 2,0,1
%endif
%macro DEQUANT_START 2
@@ -672,9 +662,12 @@
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
+;-----------------------------------------------------------------------------
+; int x264_coeff_last( int16_t *dct )
+;-----------------------------------------------------------------------------
+
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
- pxor xmm2, xmm2
packsswb xmm0, [%2+16]
pcmpeqb xmm0, xmm2
pmovmskb %1, xmm0
@@ -683,7 +676,6 @@
%macro LAST_MASK_MMX 3
movq mm0, [%2+ 0]
movq mm1, [%2+16]
- pxor mm2, mm2
packsswb mm0, [%2+ 8]
packsswb mm1, [%2+24]
pcmpeqb mm0, mm2
@@ -694,45 +686,60 @@
or %1, %3
%endmacro
+%macro LAST_X86 3
+ bsr %1, %2
+%endmacro
+
+%macro LAST_SSE4A 3
+ lzcnt %1, %2
+ xor %1, %3
+%endmacro
+
+%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
-cglobal x264_coeff_last4_mmxext, 1,1
- bsr rax, [r0]
+cglobal x264_coeff_last4_%1, 1,1
+ LAST rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal x264_coeff_last4_mmxext, 0,3
+cglobal x264_coeff_last4_%1, 0,3
mov edx, r0m
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
- bsr eax, eax
+ LAST eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
+ pxor m2, m2
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
dec eax
RET
cglobal x264_coeff_last16_%1, 1,3
+ pxor m2, m2
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
RET
%ifndef ARCH_X86_64
-%ifidn %1, mmxext
- cglobal x264_coeff_last64_%1, 1,5
-%else
- cglobal x264_coeff_last64_%1, 1,4
-%endif
+cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
+ pxor m2, m2
LAST_MASK r1d, r0, r4d
LAST_MASK r2d, r0+32, r4d
shl r2d, 16
@@ -744,17 +751,15 @@
not r1d
xor r2d, -1
jne .secondhalf
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
RET
.secondhalf:
- bsr eax, r2d
+ LAST eax, r2d, 0x1f
add eax, 32
RET
-%endif
-%endmacro
-
-%ifdef ARCH_X86_64
- cglobal x264_coeff_last64_sse2, 1,4
+%else
+cglobal x264_coeff_last64_%1, 1,4
+ pxor m2, m2
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
LAST_MASK_SSE2 r3d, r0+64
@@ -766,13 +771,94 @@
shl r3, 32
or r1, r3
not r1
- bsr rax, r1
+ LAST rax, r1, 0x3f
RET
%endif
+%endmacro
+%define LAST LAST_X86
%ifndef ARCH_X86_64
+INIT_MMX
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
+INIT_XMM
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
+%define LAST LAST_SSE4A
+COEFF_LAST sse2_lzcnt
+
+;-----------------------------------------------------------------------------
+; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+;-----------------------------------------------------------------------------
+
+%macro LAST_MASK4_MMX 2-3
+ movq mm0, [%2]
+ packsswb mm0, mm0
+ pcmpeqb mm0, mm2
+ pmovmskb %1, mm0
+%endmacro
+
+%macro LZCOUNT_X86 3
+ bsr %1, %2
+ xor %1, %3
+%endmacro
+
+%macro LZCOUNT_SSE4A 3
+ lzcnt %1, %2
+%endmacro
+
+; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6
+%else
+ DECLARE_REG_TMP 6,3,2,1,4,5,0
+%endif
+
+%macro COEFF_LEVELRUN 2
+cglobal x264_coeff_level_run%2_%1,0,7
+ movifnidn t0d, r0m
+ movifnidn t1d, r1m
+ pxor m2, m2
+ LAST_MASK t5d, t0-(%2&1)*2, t4d
+ not t5d
+ shl t5d, 32-((%2+1)&~1)
+ mov t4d, %2-1
+ LZCOUNT t3d, t5d, 0x1f
+ xor t6d, t6d
+ add t5d, t5d
+ sub t4d, t3d
+ shl t5d, t3b
+ mov [t1], t4d
+.loop:
+ LZCOUNT t3d, t5d, 0x1f
+ mov t2w, [t0+t4*2]
+ mov [t1+t6 +36], t3b
+ mov [t1+t6*2+ 4], t2w
+ inc t3d
+ shl t5d, t3b
+ inc t6d
+ sub t4d, t3d
+ jge .loop
+ RET
+%endmacro
+
+INIT_MMX
+%define LZCOUNT LZCOUNT_X86
+%ifndef ARCH_X86_64
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LEVELRUN mmxext, 15
+COEFF_LEVELRUN mmxext, 16
+%endif
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext, 4
+INIT_XMM
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LEVELRUN sse2, 15
+COEFF_LEVELRUN sse2, 16
+%define LZCOUNT LZCOUNT_SSE4A
+COEFF_LEVELRUN sse2_lzcnt, 15
+COEFF_LEVELRUN sse2_lzcnt, 16
+INIT_MMX
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext_lzcnt, 4

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/quant.h
@@ -64,5 +64,17 @@
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
+int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
+int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
#endif

Changed: x264-snapshot-20090119-2245.tar.bz2/common/x86/x86inc.asm
@@ -116,6 +116,29 @@
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+
%ifdef ARCH_X86_64
%define gprsize 8
%else

Changed: x264-snapshot-20090119-2245.tar.bz2/configure
@@ -263,7 +263,7 @@
then
ALTIVECFLAGS="$ALTIVECFLAGS -faltivec -fastf -mcpu=G4"
else
- ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec"
+ ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec -DHAVE_ALTIVEC_H"
fi
;;
sparc)

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/analyse.c
@@ -22,6 +22,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
+#define _ISOC99_SOURCE
#include <math.h>
#include <limits.h>
#ifndef _MSC_VER
@@ -29,6 +30,7 @@
#endif
#include "common/common.h"
+#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
@@ -77,6 +79,8 @@
int i_lambda2;
int i_qp;
int16_t *p_cost_mv;
+ uint16_t *p_cost_ref0;
+ uint16_t *p_cost_ref1;
int i_mbrd;
@@ -168,6 +172,7 @@
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
uint16_t *x264_cost_mv_fpel[52][4];
+uint16_t x264_cost_ref[52][3][33];
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
@@ -177,6 +182,7 @@
if( !p_cost_mv[a->i_qp] )
{
+ x264_emms();
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
@@ -184,10 +190,15 @@
for( i = 0; i <= 2*4*2048; i++ )
{
p_cost_mv[a->i_qp][-i] =
- p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
+ p_cost_mv[a->i_qp][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
}
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < 33; j++ )
+ x264_cost_ref[a->i_qp][i][j] = a->i_lambda * bs_size_te( i, j );
}
a->p_cost_mv = p_cost_mv[a->i_qp];
+ a->p_cost_ref0 = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+ a->p_cost_ref1 = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* FIXME is this useful for all me methods? */
if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
@@ -727,8 +738,9 @@
}
else
{
+ static const uint16_t cost_div_fix8[3] = {1024,512,341};
a->i_satd_i8x8 = COST_MAX;
- i_cost = i_cost * 4/(idx+1);
+ i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
}
if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
return;
@@ -1037,7 +1049,7 @@
(m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
#define REF_COST(list, ref) \
- (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref ))
+ (a->p_cost_ref##list[ref])
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
@@ -2464,11 +2476,7 @@
{
if( !h->mb.b_direct_auto_write )
x264_mb_mc( h );
- if( h->mb.b_lossless )
- {
- /* chance of skip is too small to bother */
- }
- else if( analysis.i_mbrd )
+ if( analysis.i_mbrd )
{
i_bskip_cost = ssd_mb( h );
/* 6 = minimum cavlc cost of a non-skipped MB */

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/cabac.c
@@ -50,9 +50,7 @@
x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
if( h->mb.i_cbp_chroma == 0 )
- {
x264_cabac_encode_decision_noup( cb, ctx2, 0 );
- }
else
{
x264_cabac_encode_decision( cb, ctx2, 1 );
@@ -77,13 +75,9 @@
{
int ctx = 0;
if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
- {
ctx++;
- }
if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
- {
ctx++;
- }
x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
}
@@ -130,18 +124,12 @@
{
int ctx = 0;
if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
- {
ctx++;
- }
if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
- {
ctx++;
- }
if( i_mb_type == B_DIRECT )
- {
x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
- }
else if( i_mb_type == B_8x8 )
{
x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
@@ -210,18 +198,12 @@
static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
{
if( i_pred == i_mode )
- {
- /* b_prev_intra4x4_pred_mode */
x264_cabac_encode_decision( cb, 68, 1 );
- }
else
{
- /* b_prev_intra4x4_pred_mode */
x264_cabac_encode_decision( cb, 68, 0 );
if( i_mode > i_pred )
- {
i_mode--;
- }
x264_cabac_encode_decision( cb, 69, (i_mode )&0x01 );
x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 );
x264_cabac_encode_decision( cb, 69, (i_mode >> 2)&0x01 );
@@ -235,22 +217,16 @@
/* No need to test for I4x4 or I_16x16 as cache_save handle that */
if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_xy - 1] != 0 )
- {
ctx++;
- }
if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 )
- {
ctx++;
- }
x264_cabac_encode_decision_noup( cb, 64 + ctx, i_mode > 0 );
if( i_mode > 0 )
{
x264_cabac_encode_decision( cb, 64 + 3, i_mode > 1 );
if( i_mode > 1 )
- {
x264_cabac_encode_decision_noup( cb, 64 + 3, i_mode > 2 );
- }
}
}
@@ -273,22 +249,16 @@
/* No need to test for SKIP/PCM */
if( h->mb.i_neighbour & MB_LEFT )
- {
cbp_a = (h->mb.cbp[h->mb.i_mb_xy - 1] >> 4)&0x3;
- }
if( h->mb.i_neighbour & MB_TOP )
- {
cbp_b = (h->mb.cbp[h->mb.i_mb_top_xy] >> 4)&0x3;
- }
ctx = 0;
if( cbp_a > 0 ) ctx++;
if( cbp_b > 0 ) ctx += 2;
if( h->mb.i_cbp_chroma == 0 )
- {
x264_cabac_encode_decision_noup( cb, 77 + ctx, 0 );
- }
else
{
x264_cabac_encode_decision_noup( cb, 77 + ctx, 1 );
@@ -316,11 +286,8 @@
}
/* No need to test for PCM / SKIP */
- if( h->mb.i_last_dqp &&
- ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) ) )
- ctx = 1;
- else
- ctx = 0;
+ ctx = h->mb.i_last_dqp &&
+ ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) );
if( i_dqp != 0 )
{
@@ -331,10 +298,7 @@
while( val-- )
{
x264_cabac_encode_decision( cb, 60 + ctx, 1 );
- if( ctx < 2 )
- ctx = 2;
- else
- ctx = 3;
+ ctx = 2+(ctx>>1);
}
}
x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 );
@@ -353,9 +317,7 @@
static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub )
{
if( i_sub == D_L0_8x8 )
- {
x264_cabac_encode_decision( cb, 21, 1 );
- }
else if( i_sub == D_L0_8x4 )
{
x264_cabac_encode_decision( cb, 21, 0 );
@@ -434,11 +396,7 @@
while( i_ref > 0 )
{
x264_cabac_encode_decision( cb, 54 + ctx, 1 );
- if( ctx < 4 )
- ctx = 4;
- else
- ctx = 5;
-
+ ctx = (ctx>>2)+4;
i_ref--;
}
x264_cabac_encode_decision( cb, 54 + ctx, 0 );
@@ -678,6 +636,7 @@
{ 4, 4, 4, 4, 5, 6, 7, 7 }
};
+#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -734,9 +693,7 @@
if( i == i_last )
{
i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;
-#if !RDO_SKIP_BS
i_coeff_sign[i_coeff] = l[i] < 0;
-#endif
i_coeff++;
}
@@ -753,15 +710,10 @@
{
x264_cabac_encode_decision( cb, ctx, 1 );
ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
-#if RDO_SKIP_BS
- cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
-#else
for( i = 0; i < i_prefix - 1; i++ )
x264_cabac_encode_decision( cb, ctx, 1 );
if( i_prefix < 14 )
x264_cabac_encode_decision( cb, ctx, 0 );
-#endif
if( i_prefix >= 14 )
x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
@@ -771,18 +723,110 @@
{
x264_cabac_encode_decision( cb, ctx, 0 );
node_ctx = coeff_abs_level_transition[0][node_ctx];
-#if RDO_SKIP_BS
- x264_cabac_encode_bypass( cb, 0 ); // sign
-#endif
}
-#if !RDO_SKIP_BS
x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
-#endif
} while( i_coeff > 0 );
}
+#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 )
+
+#else
+
+/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct
+ * this is slightly incorrect because the sigmap is not reversible
+ * (contexts are repeated). However, there is nearly no quality penalty
+ * for this (~0.001db) and the speed boost (~30%) is worth it. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
+{
+ const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+ const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+ const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat];
+ const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
+ int i_last, i_coeff_abs_m1, ctx, i_prefix, i, node_ctx;
+ if( !b_8x8 )
+ {
+ /* coded block flag */
+ ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ return;
+ }
+ }
+ i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
+
+ i_coeff_abs_m1 = abs(l[i_last]) - 1;
+ i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+ ctx = coeff_abs_level1_ctx[0] + i_ctx_level;
+
+ if( i_last != i_count - 1 )
+ {
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 );
+ x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 );
+ }
+
+ if( i_prefix )
+ {
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level;
+ cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+ if( i_prefix >= 14 )
+ x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+ node_ctx = coeff_abs_level_transition[1][0];
+ }
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ node_ctx = coeff_abs_level_transition[0][0];
+ x264_cabac_encode_bypass( cb, 0 ); // sign
+ }
+
+ for( i = i_last-1 ; i >= 0; i-- )
+ {
+ if( l[i] )
+ {
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 );
+ x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 );
+ ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+
+ if( (unsigned)(l[i]+1) > 2 )
+ {
+ i_coeff_abs_m1 = abs(l[i]) - 1;
+ i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
+ cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+ if( i_prefix >= 14 )
+ x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+ node_ctx = coeff_abs_level_transition[1][node_ctx];
+ }
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ node_ctx = coeff_abs_level_transition[0][node_ctx];
+ x264_cabac_encode_bypass( cb, 0 );
+ }
+ }
+ else
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 );
+ }
+}
+
+static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
+{
+ block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 );
+}
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+{
+ block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 );
+}
+#endif
void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
@@ -923,18 +967,11 @@
else if( i_mb_type != B_DIRECT )
{
/* All B mode */
- int b_list[2][2];
-
- /* init ref list utilisations */
- for( i = 0; i < 2; i++ )
- {
- b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
- b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
- }
+ const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
for( i_list = 0; i_list < 2; i_list++ )
{
- const int i_ref_max = i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1];
+ const int i_ref_max = h->mb.pic.i_fref[i_list];
if( i_ref_max > 1 )
{
@@ -1008,7 +1045,7 @@
{
for( i = 0; i < 4; i++ )
if( h->mb.i_cbp_luma & ( 1 << i ) )
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 );
+ block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] );
}
else
{
@@ -1054,8 +1091,8 @@
x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
- if( x264_mb_type_list0_table[ i_mb_type ][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- if( x264_mb_type_list1_table[ i_mb_type ][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
}
else if( i_mb_type == B_8x8 )
{
@@ -1073,7 +1110,7 @@
if( h->mb.i_cbp_luma & (1 << i8) )
{
if( h->mb.b_transform_8x8 )
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
+ block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] );
else
{
int i4;
@@ -1112,7 +1149,7 @@
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
+ block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
}
else
{
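
A note on the two branchless context updates introduced above: ctx = 2+(ctx>>1) in the
mb_qp_delta unary loop and ctx = (ctx>>2)+4 in the ref_idx loop only work because the
context index stays in a small known range. A quick standalone check of that equivalence,
written for this note rather than taken from the x264 tree:

    #include <assert.h>

    int main(void)
    {
        int ctx;
        /* mb_qp_delta loop: ctx enters as 0 or 1, then stays in {2,3} */
        for( ctx = 0; ctx <= 3; ctx++ )
            assert( (ctx < 2 ? 2 : 3) == 2 + (ctx >> 1) );
        /* ref_idx loop: ctx enters as 0..3, then stays in {4,5} */
        for( ctx = 0; ctx <= 5; ctx++ )
            assert( (ctx < 4 ? 4 : 5) == (ctx >> 2) + 4 );
        return 0;
    }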

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/cavlc.c
@@ -96,7 +96,7 @@
/* Weight highly against overflows. */
s->i_bits_encoded += 1000000;
#else
- x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
+ x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
/* clip level, preserving sign */
i_level_code = (1<<12) - 2 + (i_level_code & 1);
#endif
@@ -116,8 +116,8 @@
{
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
- int level[16], run[16];
- int i_trailing, i_total_zero, i_last, i_suffix_length, i;
+ x264_run_level_t runlevel;
+ int i_trailing, i_total_zero, i_suffix_length, i;
int i_total = 0;
unsigned int i_sign;
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
@@ -125,40 +125,30 @@
if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
{
- bs_write_vlc( s, x264_coeff_token[nC][0] );
+ bs_write_vlc( s, x264_coeff0_token[nC] );
return;
}
- i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
- i_total_zero = i_last + 1;
-
/* level and run and total */
/* set these to 2 to allow branchless i_trailing calculation */
- level[1] = 2;
- level[2] = 2;
- do
- {
- int r = 0;
- level[i_total] = l[i_last];
- while( --i_last >= 0 && l[i_last] == 0 )
- r++;
- run[i_total++] = r;
- } while( i_last >= 0 );
+ runlevel.level[1] = 2;
+ runlevel.level[2] = 2;
+ i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel );
+ i_total_zero = runlevel.last + 1 - i_total;
h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
- i_total_zero -= i_total;
- i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
- | ((((level[1]+1) | (1-level[1])) >> 31) & 2)
- | ((((level[2]+1) | (1-level[2])) >> 31) & 4);
+ i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
+ | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
+ | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
i_trailing = ctz_index[i_trailing];
- i_sign = ((level[2] >> 31) & 1)
- | ((level[1] >> 31) & 2)
- | ((level[0] >> 31) & 4);
+ i_sign = ((runlevel.level[2] >> 31) & 1)
+ | ((runlevel.level[1] >> 31) & 2)
+ | ((runlevel.level[0] >> 31) & 4);
i_sign >>= 3-i_trailing;
/* total/trailing */
- bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
+ bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing-4] );
i_suffix_length = i_total > 10 && i_trailing < 3;
if( i_trailing > 0 || RDO_SKIP_BS )
@@ -166,10 +156,10 @@
if( i_trailing < i_total )
{
- int16_t val = level[i_trailing];
- int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
+ int16_t val = runlevel.level[i_trailing];
+ int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
if( i_trailing < 3 )
- val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+ val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
@@ -181,7 +171,7 @@
i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
for( i = i_trailing+1; i < i_total; i++ )
{
- val = level[i] + LEVEL_TABLE_SIZE/2;
+ val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
if( (unsigned)val < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
@@ -203,8 +193,8 @@
for( i = 0; i < i_total-1 && i_total_zero > 0; i++ )
{
int i_zl = X264_MIN( i_total_zero - 1, 6 );
- bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
- i_total_zero -= run[i];
+ bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] );
+ i_total_zero -= runlevel.run[i];
}
}
@@ -441,17 +431,17 @@
}
else if( i_mb_type == P_8x8 )
{
- int b_sub_ref0;
+ int b_sub_ref;
if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
{
bs_write_ue( s, 4 );
- b_sub_ref0 = 0;
+ b_sub_ref = 0;
}
else
{
bs_write_ue( s, 3 );
- b_sub_ref0 = 1;
+ b_sub_ref = h->mb.pic.i_fref[0] > 1;
}
/* sub mb type */
@@ -462,7 +452,7 @@
bs_write( s, 4, 0xf );
/* ref0 */
- if( h->mb.pic.i_fref[0] > 1 && b_sub_ref0 )
+ if( b_sub_ref )
{
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
@@ -501,15 +491,7 @@
/* Motion Vector */
int i_list;
DECLARE_ALIGNED_4( int16_t mvp[2] );
-
- int b_list[2][2];
-
- /* init ref list utilisations */
- for( i = 0; i < 2; i++ )
- {
- b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
- b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
- }
+ const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
@@ -655,8 +637,8 @@
cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 );
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
- if( x264_mb_type_list0_table[ i_mb_type ][!!i8] ) cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 );
- if( x264_mb_type_list1_table[ i_mb_type ][!!i8] ) cavlc_mb_mvd( h, &s, 1, 4*i8, 4>>b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 );
+ if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &s, 1, 4*i8, 4>>b_8x16 );
}
else if( i_mb_type == B_8x8 )
{
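
Earlier in this cavlc.c diff, the open-coded level/run scan is swapped for a call through
h->quantf.coeff_level_run, which fills the new x264_run_level_t in one pass. A plain-C
sketch of what such a scalar routine could look like, reconstructed from the loop the diff
removes; the struct stand-in, names and signature here are illustrative, not the real x264
ones:

    #include <stdint.h>

    /* minimal stand-in for this sketch; mirrors the last/level/run fields used above */
    typedef struct
    {
        int     last;
        int16_t level[16];
        uint8_t run[16];
    } run_level_sketch_t;

    /* Walk from the last nonzero coefficient down to index 0, recording each
     * level and the run of zeros in front of it; return the nonzero count. */
    int coeff_level_run_sketch( const int16_t *dct, int i_last, run_level_sketch_t *rl )
    {
        int i_total = 0;
        rl->last = i_last;
        do
        {
            int r = 0;
            rl->level[i_total] = dct[i_last];
            while( --i_last >= 0 && dct[i_last] == 0 )
                r++;
            rl->run[i_total++] = r;
        } while( i_last >= 0 );
        return i_total;
    }

With i_total computed this way, the i_total_zero used above is simply runlevel.last + 1 - i_total.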

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/encoder.c
@@ -411,6 +411,7 @@
h->param.analyse.b_fast_pskip = 0;
h->param.analyse.i_noise_reduction = 0;
h->param.analyse.f_psy_rd = 0;
+ h->param.i_bframe = 0;
/* 8x8dct is not useful at all in CAVLC lossless */
if( !h->param.b_cabac )
h->param.analyse.b_transform_8x8 = 0;
@@ -713,6 +714,7 @@
|| h->param.i_bframe_adaptive
|| h->param.b_pre_scenecut );
h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+ h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
h->frames.i_last_idr = - h->param.i_keyint_max;
h->frames.i_input = 0;
@@ -824,7 +826,9 @@
COPY( analyse.intra );
COPY( analyse.inter );
COPY( analyse.i_direct_mv_pred );
- COPY( analyse.i_me_range );
+ /* Scratch buffer prevents me_range from being increased for esa/tesa */
+ if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
+ COPY( analyse.i_me_range );
COPY( analyse.i_noise_reduction );
/* We can't switch out of subme=0 during encoding. */
if( h->param.analyse.i_subpel_refine )
@@ -839,6 +843,8 @@
// can only twiddle these if they were enabled to begin with:
if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
COPY( analyse.i_me_method );
+ if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
+ h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
if( h->pps->b_transform_8x8_mode )
COPY( analyse.b_transform_8x8 );
if( h->frames.i_max_ref1 > 1 )
@@ -1046,7 +1052,7 @@
x264_pixel_ssim_wxh( &h->pixf,
h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
- h->param.i_width-2, max_y-min_y );
+ h->param.i_width-2, max_y-min_y, h->scratch_buffer );
}
}
@@ -1433,7 +1439,7 @@
return 0;
}
- x264_slicetype_decide( h );
+ x264_stack_align( x264_slicetype_decide, h );
/* 3: move some B-frames and 1 non-B to encode queue */
while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
@@ -1976,8 +1982,8 @@
for( i = 0; i < X264_PARTTYPE_MAX; i++ )
for( j = 0; j < 2; j++ )
{
- int l0 = x264_mb_type_list0_table[i][j];
- int l1 = x264_mb_type_list1_table[i][j];
+ int l0 = x264_mb_type_list_table[i][0][j];
+ int l1 = x264_mb_type_list_table[i][1][j];
if( l0 || l1 )
list_count[l1+l0*l1] += h->stat.i_mb_count[SLICE_TYPE_B][i] * 2;
}
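
The x264_mb_type_list_table lookups in the cabac.c, cavlc.c and encoder.c hunks replace the
old pair of list0/list1 tables with one merged table. Judging purely from how it is indexed
here ([mb_type][list][partition]), its shape is roughly the following; this is an
illustration only, the real declaration and initialiser live elsewhere in the tree:

    #include <stdint.h>

    /* nonzero means that partition of that macroblock type uses that reference list */
    extern const uint8_t x264_mb_type_list_table[][2][2];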

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/me.c
@@ -474,8 +474,7 @@
DECLARE_ALIGNED_16( int enc_dc[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
- int16_t xs_buf[64];
- int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
+ int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
@@ -492,11 +491,7 @@
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
- typedef struct {
- int sad;
- int16_t mx, my;
- } mvsad_t;
- mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
+ mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
@@ -581,7 +576,6 @@
}
for( i=0; i<nmvsad; i++ )
COST_MV( mvsads[i].mx, mvsads[i].my );
- x264_free( mvsads );
}
else
{
@@ -601,9 +595,6 @@
COST_MV( min_x+xs[i], my );
}
}
-
- if( xs != xs_buf )
- x264_free( xs );
#endif
}
break;

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/me.h
@@ -48,6 +48,11 @@
DECLARE_ALIGNED_4( int16_t mv[2] );
} DECLARE_ALIGNED_16( x264_me_t );
+typedef struct {
+ int sad;
+ int16_t mx, my;
+} mvsad_t;
+
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
{ x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
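
In the me.c hunk above, the TESA temporaries now come out of the per-thread scratch buffer
instead of per-call mallocs. The carving the hunk implies, restated here only to make the
layout explicit: the int16_t xs[] array sits at the start of the buffer, rounded up to a
multiple of 16 elements (32 bytes), and the mvsad_t array follows immediately after it:

    int16_t *xs     = h->scratch_buffer;
    mvsad_t *mvsads = (mvsad_t *)(xs + ((width + 15) & ~15));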

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/ratecontrol.c
@@ -174,8 +174,8 @@
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- unsigned int var=0, sad, i;
- for( i=0; i<3; i++ )
+ unsigned int var = 0, i;
+ for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
int stride = frame->i_stride[i];
@@ -184,7 +184,7 @@
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
+ var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
}
var = X264_MAX(var,1);
x264_emms();
@@ -441,6 +441,12 @@
if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
+ if( !strstr( opts, "direct=3" ) && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
+ {
+ x264_log( h, X264_LOG_WARNING, "direct=auto not used on the first pass\n" );
+ h->mb.b_direct_auto_write = 1;
+ }
+
if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS )
h->param.i_bframe_adaptive = i;
else if( h->param.i_bframe )
@@ -612,7 +618,7 @@
p += len;
if( !*p )
return 0;
- z->param = malloc( sizeof(x264_param_t) );
+ z->param = x264_malloc( sizeof(x264_param_t) );
memcpy( z->param, &h->param, sizeof(x264_param_t) );
while( (tok = strtok_r( p, ",", &saveptr )) )
{
@@ -1507,7 +1513,7 @@
expected_size = qscale2bits(&rce, q);
expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
}
- rcc->last_satd = x264_rc_analyse_slice( h );
+ rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
}
q = x264_clip3f( q, lmin, lmax );
}
@@ -1525,7 +1531,7 @@
double wanted_bits, overflow=1, lmin, lmax;
- rcc->last_satd = x264_rc_analyse_slice( h );
+ rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
rcc->short_term_cplxsum *= 0.5;
rcc->short_term_cplxcount *= 0.5;
rcc->short_term_cplxsum += rcc->last_satd;

Changed: x264-snapshot-20090119-2245.tar.bz2/encoder/slicetype.c
@@ -489,7 +489,7 @@
if( !h->frames.last_nonb )
return;
frames[0] = h->frames.last_nonb;
- for( j = 0; h->frames.next[j]; j++ )
+ for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ )
frames[j+1] = h->frames.next[j];
keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
num_frames = X264_MIN( j, keyint_limit );
@@ -630,10 +630,8 @@
frm->i_type = X264_TYPE_P;
}
- if( frm->i_type != X264_TYPE_AUTO && frm->i_type != X264_TYPE_B && frm->i_type != X264_TYPE_BREF )
- break;
-
- frm->i_type = X264_TYPE_B;
+ if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B;
+ else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
}
}

Changed: x264-snapshot-20090119-2245.tar.bz2/muxers.c
@@ -290,7 +290,7 @@
header[slen] = 0;
if (strncmp(header, Y4M_FRAME_MAGIC, slen))
{
- fprintf(stderr, "Bad header magic (%08X <=> %s)\n",
+ fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
*((uint32_t*)header), header);
return -1;
}
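
The format-string fix above replaces %08X with the inttypes.h width macro, which prints a
uint32_t portably whether the platform defines it as unsigned int or unsigned long. A
minimal standalone example of the same pattern (the value is arbitrary):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t magic = 0x30395559;
        printf( "Bad header magic (%"PRIx32")\n", magic );
        return 0;
    }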

Changed: x264-snapshot-20090119-2245.tar.bz2/tools/checkasm.c
@@ -156,7 +156,8 @@
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
- b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
+ b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
+ b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -330,16 +331,15 @@
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- uint32_t res_c, res_asm; \
- uint32_t sad_c, sad_asm; \
+ int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
- if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ res_c = call_c( pixel_c.var[i], buf1, 16 ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
} \
}
@@ -408,8 +408,8 @@
int sums[5][4] = {{0}};
used_asm = ok = 1;
x264_emms();
- res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 );
- res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
+ res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
+ res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
if( fabs(res_c - res_a) > 1e-6 )
{
ok = 0;
@@ -793,12 +793,13 @@
uint8_t *src = buf1+8+2*64;
uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
+ void *tmp = buf3+49*64;
set_func_name( "hpel_filter" );
ok = 1; used_asm = 1;
memset( buf3, 0, 4096 );
memset( buf4, 0, 4096 );
- call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
- call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
+ call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10, tmp );
+ call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10, tmp );
for( i=0; i<3; i++ )
for( j=0; j<10; j++ )
//FIXME ideally the first pixels would match too, but they aren't actually used
@@ -822,33 +823,57 @@
uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
set_func_name( "lowres_init" );
+ ok = 1; used_asm = 1;
for( w=40; w<=48; w+=8 )
- if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+ {
+ int stride = (w+8)&~15;
+ call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+ call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+ for( i=0; i<16; i++)
{
- int stride = (w+8)&~15;
- used_asm = 1;
- call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
- call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
- for( i=0; i<16; i++)
- {
- for( j=0; j<4; j++)
- if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
- {
- ok = 0;
- fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
- for( k=0; k<w; k++ )
- printf( "%d ", dstc[j][k+i*stride] );
- printf("\n");
- for( k=0; k<w; k++ )
- printf( "%d ", dsta[j][k+i*stride] );
- printf("\n");
- break;
- }
- }
+ for( j=0; j<4; j++)
+ if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+ {
+ ok = 0;
+ fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+ for( k=0; k<w; k++ )
+ printf( "%d ", dstc[j][k+i*stride] );
+ printf("\n");
+ for( k=0; k<w; k++ )
+ printf( "%d ", dsta[j][k+i*stride] );
+ printf("\n");
+ break;
+ }
}
+ }
report( "lowres init :" );
}
+#define INTEGRAL_INIT( name, size, ... )\
+ if( mc_a.name != mc_ref.name )\
+ {\
+ int stride = 80;\
+ set_func_name( #name );\
+ used_asm = 1;\
+ memcpy( buf3, buf1, size*2*stride );\
+ memcpy( buf4, buf1, size*2*stride );\
+ uint16_t *sum = (uint16_t*)buf3;\
+ call_c1( mc_c.name, __VA_ARGS__ );\
+ sum = (uint16_t*)buf4;\
+ call_a1( mc_a.name, __VA_ARGS__ );\
+ if( memcmp( buf3, buf4, (stride-8)*2 )\
+ || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
+ ok = 0;\
+ call_c2( mc_c.name, __VA_ARGS__ );\
+ call_a2( mc_a.name, __VA_ARGS__ );\
+ }
+ ok = 1; used_asm = 0;
+ INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
+ INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+ INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
+ INTEGRAL_INIT( integral_init8v, 9, sum, stride );
+ report( "integral init :" );
+
return ret;
}
@@ -1104,7 +1129,7 @@
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
- ok = 1;
+ ok = 1; used_asm = 0;
if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
int size;
@@ -1137,21 +1162,18 @@
dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
if( ac ) \
dct1[0] = 0; \
- memcpy( dct2, dct1, w*w*2 ); \
- result_c = call_c1( qf_c.decname, (void*)dct2 ); \
- result_a = call_a1( qf_a.decname, (void*)dct2 ); \
+ result_c = call_c( qf_c.decname, (void*)dct1 ); \
+ result_a = call_a( qf_a.decname, (void*)dct1 ); \
if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
{ \
ok = 0; \
fprintf( stderr, #decname ": [FAILED]\n" ); \
break; \
} \
- call_c2( qf_c.decname, (void*)dct2 ); \
- call_a2( qf_a.decname, (void*)dct2 ); \
} \
}
- ok = 1;
+ ok = 1; used_asm = 0;
TEST_DECIMATE( decimate_score64, 8, 0, 6 );
TEST_DECIMATE( decimate_score16, 4, 0, 6 );
TEST_DECIMATE( decimate_score15, 4, 1, 7 );
@@ -1171,27 +1193,60 @@
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
- memcpy( dct2, dct1, w*w*2 ); \
- result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \
- result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \
+ result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
+ result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
if( result_c != result_a ) \
{ \
ok = 0; \
fprintf( stderr, #lastname ": [FAILED]\n" ); \
break; \
} \
- call_c2( qf_c.last, (void*)(dct2+ac) ); \
- call_a2( qf_a.last, (void*)(dct2+ac) ); \
} \
}
- ok = 1;
+ ok = 1; used_asm = 0;
TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
report( "coeff_last :" );
+#define TEST_LEVELRUN( lastname, name, w, ac ) \
+ if( qf_a.lastname != qf_ref.lastname ) \
+ { \
+ set_func_name( #name ); \
+ used_asm = 1; \
+ for( i = 0; i < 100; i++ ) \
+ { \
+ x264_run_level_t runlevel_c, runlevel_a; \
+ int result_c, result_a, idx, nnz=0; \
+ int max = rand() & (w*w-1); \
+ memset( dct1, 0, w*w*2 ); \
+ memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
+ memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
+ for( idx = ac; idx < max; idx++ ) \
+ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
+ if( !nnz ) \
+ dct1[ac] = 1; \
+ result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
+ result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
+ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+ memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
+ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name ": [FAILED]\n" ); \
+ break; \
+ } \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+ report( "coeff_level_run :" );
+
return ret;
}
@@ -1338,6 +1393,11 @@
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
@@ -1351,6 +1411,12 @@
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )

Changed: x264-snapshot-20090119-2245.tar.bz2/x264.c
@@ -220,7 +220,9 @@
" where <option> is either\n"
" q=<integer> (force QP)\n"
" or b=<float> (bitrate multiplier)\n" );
- H1( " --qpfile <string> Force frametypes and QPs\n" );
+ H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
+ " Format of each line: framenumber frametype QP\n"
+ " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
H0( "\n" );
H0( "Analysis:\n" );
H0( "\n" );
@@ -563,8 +565,6 @@
fprintf( stderr, "x264 [error]: can't open `%s'\n", optarg );
return -1;
}
- param->i_scenecut_threshold = -1;
- param->i_bframe_adaptive = X264_B_ADAPT_NONE;
break;
case OPT_THREAD_INPUT:
b_thread_input = 1;
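
As an illustration of the qpfile format documented above (one line per forced frame: frame
number, frame type, then QP, with -1 leaving the QP choice to ratecontrol), a hypothetical
file might look like:

    0 I -1
    100 I 22
    150 P -1
    151 B -1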

Changed: x264-snapshot-20090119-2245.tar.bz2/x264.h
@@ -62,6 +62,7 @@
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
/* Analyse flags
*/
@@ -341,7 +342,11 @@
typedef struct
{
- /* In: force picture type (if not auto) XXX: ignored for now
+ /* In: force picture type (if not auto)
+ * If x264 encoding parameters are violated in the forcing of picture types,
+ * x264 will correct the input picture type and log a warning.
+ * The quality of frametype decisions may suffer if a great deal of fine-grained
+ * mixing of auto and forced frametypes is done.
* Out: type of the picture encoded */
int i_type;
/* In: force quantizer for > 0 */

Changed: x264-snapshot-20090228-2245.tar.bz2

Changed: x264-snapshot-20090627-2245.tar.bz2